diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..9d515d3d5e4e2fc70209a0fa148587f181e4b55e 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +codegemma_instruct_cot_ft_lora_r64_alpha64/tokenizer.json filter=lfs diff=lfs merge=lfs -text +codegemma_instruct_cot_ft_lora_r64_alpha64/**/tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/README.md b/codegemma_instruct_cot_ft_lora_r64_alpha64/README.md new file mode 100644 index 0000000000000000000000000000000000000000..fb0d77d70fdc5c829c8889cb85828736b7eb9714 --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/README.md @@ -0,0 +1,202 @@ +--- +base_model: unsloth/codegemma-7b-it +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.15.0 \ No newline at end of file diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/adapter_config.json b/codegemma_instruct_cot_ft_lora_r64_alpha64/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e841602c6a59fc7b085ac647af4d4c312445d261 --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/adapter_config.json @@ -0,0 +1,39 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "unsloth/codegemma-7b-it", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "o_proj", + "up_proj", + "q_proj", + "down_proj", + "gate_proj", + "k_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/adapter_model.safetensors b/codegemma_instruct_cot_ft_lora_r64_alpha64/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..de27ccddf1ad082a43332935be888e13539a266a --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b9a683aab0635490d46fb72d7c531a3a443ffe4435c539ad2d7f933ae1397d5 +size 800116456 diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-100/README.md b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-100/README.md new file mode 100644 index 0000000000000000000000000000000000000000..fb0d77d70fdc5c829c8889cb85828736b7eb9714 --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-100/README.md @@ -0,0 +1,202 @@ +--- +base_model: unsloth/codegemma-7b-it +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.15.0 \ No newline at end of file diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-100/adapter_config.json b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-100/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e841602c6a59fc7b085ac647af4d4c312445d261 --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-100/adapter_config.json @@ -0,0 +1,39 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "unsloth/codegemma-7b-it", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "o_proj", + "up_proj", + "q_proj", + "down_proj", + "gate_proj", + "k_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-100/adapter_model.safetensors b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-100/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a710100266d0b81475da9c4aa217212d33b8c1a8 --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-100/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:64e267050be1a43e3b401da7e45c66497974e0eb05f8f8a684c91b23aad584c4 +size 800116456 diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-100/optimizer.pt b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-100/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..5272921ecb85c383a3e560ab9b1ccde658f9a853 --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-100/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c5ec055e0ecb1d77f399dcf0d034378575853e437b45a99193e4df6760dc771c +size 406743412 diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-100/rng_state.pth b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-100/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..f8d20ff01271ccf633065ed8e4601c7b74d10586 --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-100/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:203e9bfabd925cb4ec7129d24877156fcee87215187c35a867e358e56a9425a4 +size 14244 diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-100/scheduler.pt b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-100/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..a88e58e815c713ae42565785071d8f370c2db949 --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-100/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fd58b39bd6bfb59bb506313d8c2a30b14e9dc11a2d99de48bc99568a9510416f +size 1064 diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-100/special_tokens_map.json b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-100/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..f6119589e367b2de0fc8cbd2f1217667532e3174 --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-100/special_tokens_map.json @@ -0,0 +1,30 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-100/tokenizer.json b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-100/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..45a5e23f54141c5f4f97a8d58f3ffadc28e287ba --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-100/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d964a2c8346d40f95791533eae48730d5f163c2e65fd16333560fd3e661df318 +size 34362915 diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-100/tokenizer.model b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-100/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..71a98ce40269d847e58957e1e070d9ae8eb184af --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-100/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:583f2ebd2a1936009b7da991ea255504db68c7a9713a78673d1335a87098966c +size 4241023 diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-100/tokenizer_config.json b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-100/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9b9b1b4acdd4afcedae39d1cf6f0bc7ef7d9910f --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-100/tokenizer_config.json @@ -0,0 +1,2011 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "<|fim_prefix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "<|fim_middle|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "<|fim_suffix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "<|file_separator|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "139": { + "content": "▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "140": { + "content": "▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "141": { + "content": "▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "142": { + "content": "▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "143": { + "content": "▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "144": { + "content": "▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "145": { + "content": "▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "146": { + "content": "▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "147": { + "content": "▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "148": { + "content": "▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "149": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "150": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "152": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "153": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "154": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "155": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "156": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "157": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "158": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "159": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "160": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "161": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "162": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "163": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "164": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "165": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "166": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "167": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "168": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255968": { + "content": "[toxicity=0]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255969": { + "content": "\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255970": { + "content": "\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255971": { + "content": "\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255972": { + "content": "\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255973": { + "content": "\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255974": { + "content": "\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255975": { + "content": "\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255976": { + "content": "\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255977": { + "content": "\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255978": { + "content": "\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255979": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255980": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255981": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255982": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255983": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255984": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255985": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255986": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255987": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255988": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255989": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255990": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255991": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255992": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255993": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255994": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255995": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255996": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255997": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255998": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255999": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "bos_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "extra_special_tokens": {}, + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "padding_side": "left", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-100/trainer_state.json b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-100/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..aa7a441a27c252b96fba96e45454925ebbf2e1b2 --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-100/trainer_state.json @@ -0,0 +1,742 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.2058142526369951, + "eval_steps": 100, + "global_step": 100, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0020581425263699513, + "grad_norm": 11.994463920593262, + "learning_rate": 2.061855670103093e-07, + "loss": 2.91, + "step": 1 + }, + { + "epoch": 0.004116285052739903, + "grad_norm": 11.769092559814453, + "learning_rate": 4.123711340206186e-07, + "loss": 2.8686, + "step": 2 + }, + { + "epoch": 0.0061744275791098535, + "grad_norm": 13.05551815032959, + "learning_rate": 6.185567010309279e-07, + "loss": 3.0286, + "step": 3 + }, + { + "epoch": 0.008232570105479805, + "grad_norm": 12.334521293640137, + "learning_rate": 8.247422680412372e-07, + "loss": 2.904, + "step": 4 + }, + { + "epoch": 0.010290712631849755, + "grad_norm": 12.075353622436523, + "learning_rate": 1.0309278350515464e-06, + "loss": 2.8991, + "step": 5 + }, + { + "epoch": 0.012348855158219707, + "grad_norm": 11.86032485961914, + "learning_rate": 1.2371134020618557e-06, + "loss": 3.0007, + "step": 6 + }, + { + "epoch": 0.014406997684589657, + "grad_norm": 10.10457992553711, + "learning_rate": 1.4432989690721649e-06, + "loss": 2.8493, + "step": 7 + }, + { + "epoch": 0.01646514021095961, + "grad_norm": 8.56408405303955, + "learning_rate": 1.6494845360824744e-06, + "loss": 2.9573, + "step": 8 + }, + { + "epoch": 0.01852328273732956, + "grad_norm": 6.307392120361328, + "learning_rate": 1.8556701030927837e-06, + "loss": 2.9507, + "step": 9 + }, + { + "epoch": 0.02058142526369951, + "grad_norm": 4.276430130004883, + "learning_rate": 2.061855670103093e-06, + "loss": 2.8988, + "step": 10 + }, + { + "epoch": 0.022639567790069464, + "grad_norm": 2.5912015438079834, + "learning_rate": 2.268041237113402e-06, + "loss": 2.9926, + "step": 11 + }, + { + "epoch": 0.024697710316439414, + "grad_norm": 2.018446207046509, + "learning_rate": 2.4742268041237115e-06, + "loss": 2.9874, + "step": 12 + }, + { + "epoch": 0.026755852842809364, + "grad_norm": 1.8558588027954102, + "learning_rate": 2.680412371134021e-06, + "loss": 2.8608, + "step": 13 + }, + { + "epoch": 0.028813995369179314, + "grad_norm": 1.9658265113830566, + "learning_rate": 2.8865979381443297e-06, + "loss": 2.8596, + "step": 14 + }, + { + "epoch": 0.030872137895549268, + "grad_norm": 1.872044563293457, + "learning_rate": 3.0927835051546395e-06, + "loss": 2.8836, + "step": 15 + }, + { + "epoch": 0.03293028042191922, + "grad_norm": 1.8884096145629883, + "learning_rate": 3.298969072164949e-06, + "loss": 2.9383, + "step": 16 + }, + { + "epoch": 0.03498842294828917, + "grad_norm": 1.8795744180679321, + "learning_rate": 3.5051546391752577e-06, + "loss": 2.883, + "step": 17 + }, + { + "epoch": 0.03704656547465912, + "grad_norm": 1.783678412437439, + "learning_rate": 3.7113402061855674e-06, + "loss": 2.8019, + "step": 18 + }, + { + "epoch": 0.039104708001029075, + "grad_norm": 1.820617914199829, + "learning_rate": 3.917525773195877e-06, + "loss": 2.8813, + "step": 19 + }, + { + "epoch": 0.04116285052739902, + "grad_norm": 1.8188731670379639, + "learning_rate": 4.123711340206186e-06, + "loss": 2.8401, + "step": 20 + }, + { + "epoch": 0.043220993053768975, + "grad_norm": 1.7305251359939575, + "learning_rate": 4.329896907216495e-06, + "loss": 2.7478, + "step": 21 + }, + { + "epoch": 0.04527913558013893, + "grad_norm": 1.7014551162719727, + "learning_rate": 4.536082474226804e-06, + "loss": 2.7356, + "step": 22 + }, + { + "epoch": 0.047337278106508875, + "grad_norm": 1.677381157875061, + "learning_rate": 4.742268041237113e-06, + "loss": 2.7593, + "step": 23 + }, + { + "epoch": 0.04939542063287883, + "grad_norm": 1.628554344177246, + "learning_rate": 4.948453608247423e-06, + "loss": 2.7689, + "step": 24 + }, + { + "epoch": 0.051453563159248775, + "grad_norm": 1.4968128204345703, + "learning_rate": 5.154639175257732e-06, + "loss": 2.6613, + "step": 25 + }, + { + "epoch": 0.05351170568561873, + "grad_norm": 1.4734832048416138, + "learning_rate": 5.360824742268042e-06, + "loss": 2.7095, + "step": 26 + }, + { + "epoch": 0.05556984821198868, + "grad_norm": 1.3745571374893188, + "learning_rate": 5.567010309278351e-06, + "loss": 2.655, + "step": 27 + }, + { + "epoch": 0.05762799073835863, + "grad_norm": 1.3381729125976562, + "learning_rate": 5.7731958762886594e-06, + "loss": 2.55, + "step": 28 + }, + { + "epoch": 0.05968613326472858, + "grad_norm": 1.3388073444366455, + "learning_rate": 5.979381443298969e-06, + "loss": 2.5219, + "step": 29 + }, + { + "epoch": 0.061744275791098535, + "grad_norm": 1.317008376121521, + "learning_rate": 6.185567010309279e-06, + "loss": 2.4491, + "step": 30 + }, + { + "epoch": 0.06380241831746848, + "grad_norm": 1.3210794925689697, + "learning_rate": 6.391752577319588e-06, + "loss": 2.4358, + "step": 31 + }, + { + "epoch": 0.06586056084383844, + "grad_norm": 1.182519555091858, + "learning_rate": 6.597938144329898e-06, + "loss": 2.4514, + "step": 32 + }, + { + "epoch": 0.06791870337020839, + "grad_norm": 1.2238099575042725, + "learning_rate": 6.804123711340207e-06, + "loss": 2.442, + "step": 33 + }, + { + "epoch": 0.06997684589657834, + "grad_norm": 1.1793314218521118, + "learning_rate": 7.010309278350515e-06, + "loss": 2.3864, + "step": 34 + }, + { + "epoch": 0.0720349884229483, + "grad_norm": 1.1983020305633545, + "learning_rate": 7.216494845360825e-06, + "loss": 2.3796, + "step": 35 + }, + { + "epoch": 0.07409313094931824, + "grad_norm": 1.2189652919769287, + "learning_rate": 7.422680412371135e-06, + "loss": 2.4152, + "step": 36 + }, + { + "epoch": 0.07615127347568819, + "grad_norm": 1.14923095703125, + "learning_rate": 7.628865979381444e-06, + "loss": 2.3298, + "step": 37 + }, + { + "epoch": 0.07820941600205815, + "grad_norm": 1.147013545036316, + "learning_rate": 7.835051546391754e-06, + "loss": 2.2488, + "step": 38 + }, + { + "epoch": 0.0802675585284281, + "grad_norm": 1.133981466293335, + "learning_rate": 8.041237113402063e-06, + "loss": 2.1825, + "step": 39 + }, + { + "epoch": 0.08232570105479804, + "grad_norm": 1.1686867475509644, + "learning_rate": 8.247422680412371e-06, + "loss": 2.2282, + "step": 40 + }, + { + "epoch": 0.084383843581168, + "grad_norm": 1.131690502166748, + "learning_rate": 8.453608247422681e-06, + "loss": 2.0962, + "step": 41 + }, + { + "epoch": 0.08644198610753795, + "grad_norm": 1.1626195907592773, + "learning_rate": 8.65979381443299e-06, + "loss": 2.1161, + "step": 42 + }, + { + "epoch": 0.0885001286339079, + "grad_norm": 1.1508581638336182, + "learning_rate": 8.865979381443299e-06, + "loss": 1.9856, + "step": 43 + }, + { + "epoch": 0.09055827116027786, + "grad_norm": 1.2286733388900757, + "learning_rate": 9.072164948453609e-06, + "loss": 2.076, + "step": 44 + }, + { + "epoch": 0.0926164136866478, + "grad_norm": 1.82068932056427, + "learning_rate": 9.278350515463918e-06, + "loss": 1.9995, + "step": 45 + }, + { + "epoch": 0.09467455621301775, + "grad_norm": 2.079101324081421, + "learning_rate": 9.484536082474226e-06, + "loss": 1.9601, + "step": 46 + }, + { + "epoch": 0.0967326987393877, + "grad_norm": 1.1209226846694946, + "learning_rate": 9.690721649484536e-06, + "loss": 1.9346, + "step": 47 + }, + { + "epoch": 0.09879084126575766, + "grad_norm": 1.0579711198806763, + "learning_rate": 9.896907216494846e-06, + "loss": 1.8764, + "step": 48 + }, + { + "epoch": 0.1008489837921276, + "grad_norm": 1.0434011220932007, + "learning_rate": 1.0103092783505156e-05, + "loss": 1.8483, + "step": 49 + }, + { + "epoch": 0.10290712631849755, + "grad_norm": 1.0089991092681885, + "learning_rate": 1.0309278350515464e-05, + "loss": 1.8018, + "step": 50 + }, + { + "epoch": 0.10496526884486751, + "grad_norm": 1.0117324590682983, + "learning_rate": 1.0515463917525775e-05, + "loss": 1.8003, + "step": 51 + }, + { + "epoch": 0.10702341137123746, + "grad_norm": 1.0006697177886963, + "learning_rate": 1.0721649484536083e-05, + "loss": 1.7482, + "step": 52 + }, + { + "epoch": 0.1090815538976074, + "grad_norm": 2.1164329051971436, + "learning_rate": 1.0927835051546391e-05, + "loss": 1.7363, + "step": 53 + }, + { + "epoch": 0.11113969642397736, + "grad_norm": 0.9573502540588379, + "learning_rate": 1.1134020618556703e-05, + "loss": 1.661, + "step": 54 + }, + { + "epoch": 0.11319783895034731, + "grad_norm": 1.0059764385223389, + "learning_rate": 1.134020618556701e-05, + "loss": 1.6979, + "step": 55 + }, + { + "epoch": 0.11525598147671726, + "grad_norm": 0.9719656109809875, + "learning_rate": 1.1546391752577319e-05, + "loss": 1.6318, + "step": 56 + }, + { + "epoch": 0.11731412400308722, + "grad_norm": 1.0024539232254028, + "learning_rate": 1.175257731958763e-05, + "loss": 1.6283, + "step": 57 + }, + { + "epoch": 0.11937226652945716, + "grad_norm": 0.9772456288337708, + "learning_rate": 1.1958762886597938e-05, + "loss": 1.5611, + "step": 58 + }, + { + "epoch": 0.12143040905582711, + "grad_norm": 0.9947625994682312, + "learning_rate": 1.2164948453608248e-05, + "loss": 1.6073, + "step": 59 + }, + { + "epoch": 0.12348855158219707, + "grad_norm": 2.112889051437378, + "learning_rate": 1.2371134020618558e-05, + "loss": 1.6208, + "step": 60 + }, + { + "epoch": 0.12554669410856703, + "grad_norm": 1.0515345335006714, + "learning_rate": 1.2577319587628866e-05, + "loss": 1.569, + "step": 61 + }, + { + "epoch": 0.12760483663493696, + "grad_norm": 1.0782145261764526, + "learning_rate": 1.2783505154639176e-05, + "loss": 1.5097, + "step": 62 + }, + { + "epoch": 0.12966297916130692, + "grad_norm": 1.154104232788086, + "learning_rate": 1.2989690721649485e-05, + "loss": 1.5472, + "step": 63 + }, + { + "epoch": 0.13172112168767688, + "grad_norm": 1.1614656448364258, + "learning_rate": 1.3195876288659795e-05, + "loss": 1.4833, + "step": 64 + }, + { + "epoch": 0.13377926421404682, + "grad_norm": 1.1720911264419556, + "learning_rate": 1.3402061855670103e-05, + "loss": 1.4644, + "step": 65 + }, + { + "epoch": 0.13583740674041678, + "grad_norm": 1.8903896808624268, + "learning_rate": 1.3608247422680415e-05, + "loss": 1.4286, + "step": 66 + }, + { + "epoch": 0.13789554926678674, + "grad_norm": 1.2675013542175293, + "learning_rate": 1.3814432989690723e-05, + "loss": 1.416, + "step": 67 + }, + { + "epoch": 0.13995369179315667, + "grad_norm": 1.266434907913208, + "learning_rate": 1.402061855670103e-05, + "loss": 1.3171, + "step": 68 + }, + { + "epoch": 0.14201183431952663, + "grad_norm": 1.3408889770507812, + "learning_rate": 1.4226804123711342e-05, + "loss": 1.3396, + "step": 69 + }, + { + "epoch": 0.1440699768458966, + "grad_norm": 1.3862446546554565, + "learning_rate": 1.443298969072165e-05, + "loss": 1.2642, + "step": 70 + }, + { + "epoch": 0.14612811937226652, + "grad_norm": 2.110553026199341, + "learning_rate": 1.4639175257731958e-05, + "loss": 1.2593, + "step": 71 + }, + { + "epoch": 0.14818626189863648, + "grad_norm": 1.7017499208450317, + "learning_rate": 1.484536082474227e-05, + "loss": 1.24, + "step": 72 + }, + { + "epoch": 0.15024440442500644, + "grad_norm": 1.9851700067520142, + "learning_rate": 1.5051546391752578e-05, + "loss": 1.2313, + "step": 73 + }, + { + "epoch": 0.15230254695137638, + "grad_norm": 2.009608030319214, + "learning_rate": 1.5257731958762888e-05, + "loss": 1.1281, + "step": 74 + }, + { + "epoch": 0.15436068947774634, + "grad_norm": 2.7587485313415527, + "learning_rate": 1.5463917525773197e-05, + "loss": 1.1248, + "step": 75 + }, + { + "epoch": 0.1564188320041163, + "grad_norm": 2.780954599380493, + "learning_rate": 1.5670103092783507e-05, + "loss": 1.0797, + "step": 76 + }, + { + "epoch": 0.15847697453048623, + "grad_norm": 3.1470866203308105, + "learning_rate": 1.5876288659793813e-05, + "loss": 1.0064, + "step": 77 + }, + { + "epoch": 0.1605351170568562, + "grad_norm": 4.653595447540283, + "learning_rate": 1.6082474226804127e-05, + "loss": 0.9219, + "step": 78 + }, + { + "epoch": 0.16259325958322615, + "grad_norm": 4.157363414764404, + "learning_rate": 1.6288659793814433e-05, + "loss": 0.8709, + "step": 79 + }, + { + "epoch": 0.16465140210959608, + "grad_norm": 4.5814924240112305, + "learning_rate": 1.6494845360824743e-05, + "loss": 0.7693, + "step": 80 + }, + { + "epoch": 0.16670954463596604, + "grad_norm": 5.096139907836914, + "learning_rate": 1.6701030927835052e-05, + "loss": 0.6868, + "step": 81 + }, + { + "epoch": 0.168767687162336, + "grad_norm": 4.858880519866943, + "learning_rate": 1.6907216494845362e-05, + "loss": 0.5971, + "step": 82 + }, + { + "epoch": 0.17082582968870594, + "grad_norm": 4.42564582824707, + "learning_rate": 1.7113402061855672e-05, + "loss": 0.4719, + "step": 83 + }, + { + "epoch": 0.1728839722150759, + "grad_norm": 7.720851421356201, + "learning_rate": 1.731958762886598e-05, + "loss": 0.3943, + "step": 84 + }, + { + "epoch": 0.17494211474144586, + "grad_norm": 0.41923192143440247, + "learning_rate": 1.752577319587629e-05, + "loss": 0.3635, + "step": 85 + }, + { + "epoch": 0.1770002572678158, + "grad_norm": 0.2771846354007721, + "learning_rate": 1.7731958762886598e-05, + "loss": 0.3597, + "step": 86 + }, + { + "epoch": 0.17905839979418575, + "grad_norm": 0.24761857092380524, + "learning_rate": 1.793814432989691e-05, + "loss": 0.3735, + "step": 87 + }, + { + "epoch": 0.1811165423205557, + "grad_norm": 0.23277048766613007, + "learning_rate": 1.8144329896907217e-05, + "loss": 0.3643, + "step": 88 + }, + { + "epoch": 0.18317468484692565, + "grad_norm": 0.22931228578090668, + "learning_rate": 1.8350515463917527e-05, + "loss": 0.3519, + "step": 89 + }, + { + "epoch": 0.1852328273732956, + "grad_norm": 0.20750615000724792, + "learning_rate": 1.8556701030927837e-05, + "loss": 0.3431, + "step": 90 + }, + { + "epoch": 0.18729096989966554, + "grad_norm": 0.2080322951078415, + "learning_rate": 1.8762886597938147e-05, + "loss": 0.3632, + "step": 91 + }, + { + "epoch": 0.1893491124260355, + "grad_norm": 0.20186181366443634, + "learning_rate": 1.8969072164948453e-05, + "loss": 0.3492, + "step": 92 + }, + { + "epoch": 0.19140725495240546, + "grad_norm": 0.19172786176204681, + "learning_rate": 1.9175257731958766e-05, + "loss": 0.3552, + "step": 93 + }, + { + "epoch": 0.1934653974787754, + "grad_norm": 0.1747850626707077, + "learning_rate": 1.9381443298969072e-05, + "loss": 0.3355, + "step": 94 + }, + { + "epoch": 0.19552354000514535, + "grad_norm": 0.196411594748497, + "learning_rate": 1.9587628865979382e-05, + "loss": 0.3271, + "step": 95 + }, + { + "epoch": 0.1975816825315153, + "grad_norm": 0.20063228905200958, + "learning_rate": 1.9793814432989692e-05, + "loss": 0.3351, + "step": 96 + }, + { + "epoch": 0.19963982505788525, + "grad_norm": 0.19240939617156982, + "learning_rate": 2e-05, + "loss": 0.3266, + "step": 97 + }, + { + "epoch": 0.2016979675842552, + "grad_norm": 0.18206572532653809, + "learning_rate": 1.997709049255441e-05, + "loss": 0.3393, + "step": 98 + }, + { + "epoch": 0.20375611011062517, + "grad_norm": 0.20384562015533447, + "learning_rate": 1.9954180985108823e-05, + "loss": 0.3395, + "step": 99 + }, + { + "epoch": 0.2058142526369951, + "grad_norm": 0.19944581389427185, + "learning_rate": 1.9931271477663232e-05, + "loss": 0.3268, + "step": 100 + }, + { + "epoch": 0.2058142526369951, + "eval_loss": 0.3456890285015106, + "eval_runtime": 2114.0178, + "eval_samples_per_second": 3.677, + "eval_steps_per_second": 0.92, + "step": 100 + } + ], + "logging_steps": 1, + "max_steps": 970, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.305891627680727e+17, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-100/training_args.bin b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..5999c7ee9dd10ee9076d748e4757533e635fa832 --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ee55a11f5a306eb7c39b536fdfe2459bc279e468da50f6adda478c4deffcb812 +size 5688 diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-200/README.md b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-200/README.md new file mode 100644 index 0000000000000000000000000000000000000000..fb0d77d70fdc5c829c8889cb85828736b7eb9714 --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-200/README.md @@ -0,0 +1,202 @@ +--- +base_model: unsloth/codegemma-7b-it +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.15.0 \ No newline at end of file diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-200/adapter_config.json b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-200/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e841602c6a59fc7b085ac647af4d4c312445d261 --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-200/adapter_config.json @@ -0,0 +1,39 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "unsloth/codegemma-7b-it", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "o_proj", + "up_proj", + "q_proj", + "down_proj", + "gate_proj", + "k_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-200/adapter_model.safetensors b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-200/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a9b2f57fef850942575d0395bcabc9f5a072eadf --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-200/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:97f49550075985c6448d99511eea8c0d140b5aa2a60faa2277a7890044a1b911 +size 800116456 diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-200/optimizer.pt b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-200/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..e2619f8e7ea912bed695d1c75bddadb21b784012 --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-200/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a9e27cacef57683319dff6c0d7b7cf66889116f9593d5c58fbafe7bfe73c72e +size 406743412 diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-200/rng_state.pth b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-200/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..f21f2ff1c1a816463781d51760f8156e041f5979 --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-200/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4981123ff3cf7bd5b7f76839e90e4776f747ca4c38dcb41876fa010c0dea8b23 +size 14244 diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-200/scheduler.pt b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-200/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..b81b162449dc76d1876d04b5d4c40c22edd1074c --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-200/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:83f316812b0c43f9354e615bde4cea4403d3ff2b0ee5dd42dfeae9f021d9e696 +size 1064 diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-200/special_tokens_map.json b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-200/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..f6119589e367b2de0fc8cbd2f1217667532e3174 --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-200/special_tokens_map.json @@ -0,0 +1,30 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-200/tokenizer.json b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-200/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..45a5e23f54141c5f4f97a8d58f3ffadc28e287ba --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-200/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d964a2c8346d40f95791533eae48730d5f163c2e65fd16333560fd3e661df318 +size 34362915 diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-200/tokenizer.model b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-200/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..71a98ce40269d847e58957e1e070d9ae8eb184af --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-200/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:583f2ebd2a1936009b7da991ea255504db68c7a9713a78673d1335a87098966c +size 4241023 diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-200/tokenizer_config.json b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-200/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9b9b1b4acdd4afcedae39d1cf6f0bc7ef7d9910f --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-200/tokenizer_config.json @@ -0,0 +1,2011 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "<|fim_prefix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "<|fim_middle|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "<|fim_suffix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "<|file_separator|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "139": { + "content": "▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "140": { + "content": "▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "141": { + "content": "▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "142": { + "content": "▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "143": { + "content": "▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "144": { + "content": "▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "145": { + "content": "▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "146": { + "content": "▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "147": { + "content": "▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "148": { + "content": "▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "149": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "150": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "152": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "153": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "154": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "155": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "156": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "157": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "158": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "159": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "160": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "161": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "162": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "163": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "164": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "165": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "166": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "167": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "168": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255968": { + "content": "[toxicity=0]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255969": { + "content": "\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255970": { + "content": "\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255971": { + "content": "\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255972": { + "content": "\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255973": { + "content": "\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255974": { + "content": "\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255975": { + "content": "\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255976": { + "content": "\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255977": { + "content": "\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255978": { + "content": "\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255979": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255980": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255981": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255982": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255983": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255984": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255985": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255986": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255987": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255988": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255989": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255990": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255991": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255992": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255993": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255994": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255995": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255996": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255997": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255998": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255999": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "bos_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "extra_special_tokens": {}, + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "padding_side": "left", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-200/trainer_state.json b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-200/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..43a20460c7cb9d078c8c6af69a1b1a2f45fbec61 --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-200/trainer_state.json @@ -0,0 +1,1450 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.4116285052739902, + "eval_steps": 100, + "global_step": 200, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0020581425263699513, + "grad_norm": 11.994463920593262, + "learning_rate": 2.061855670103093e-07, + "loss": 2.91, + "step": 1 + }, + { + "epoch": 0.004116285052739903, + "grad_norm": 11.769092559814453, + "learning_rate": 4.123711340206186e-07, + "loss": 2.8686, + "step": 2 + }, + { + "epoch": 0.0061744275791098535, + "grad_norm": 13.05551815032959, + "learning_rate": 6.185567010309279e-07, + "loss": 3.0286, + "step": 3 + }, + { + "epoch": 0.008232570105479805, + "grad_norm": 12.334521293640137, + "learning_rate": 8.247422680412372e-07, + "loss": 2.904, + "step": 4 + }, + { + "epoch": 0.010290712631849755, + "grad_norm": 12.075353622436523, + "learning_rate": 1.0309278350515464e-06, + "loss": 2.8991, + "step": 5 + }, + { + "epoch": 0.012348855158219707, + "grad_norm": 11.86032485961914, + "learning_rate": 1.2371134020618557e-06, + "loss": 3.0007, + "step": 6 + }, + { + "epoch": 0.014406997684589657, + "grad_norm": 10.10457992553711, + "learning_rate": 1.4432989690721649e-06, + "loss": 2.8493, + "step": 7 + }, + { + "epoch": 0.01646514021095961, + "grad_norm": 8.56408405303955, + "learning_rate": 1.6494845360824744e-06, + "loss": 2.9573, + "step": 8 + }, + { + "epoch": 0.01852328273732956, + "grad_norm": 6.307392120361328, + "learning_rate": 1.8556701030927837e-06, + "loss": 2.9507, + "step": 9 + }, + { + "epoch": 0.02058142526369951, + "grad_norm": 4.276430130004883, + "learning_rate": 2.061855670103093e-06, + "loss": 2.8988, + "step": 10 + }, + { + "epoch": 0.022639567790069464, + "grad_norm": 2.5912015438079834, + "learning_rate": 2.268041237113402e-06, + "loss": 2.9926, + "step": 11 + }, + { + "epoch": 0.024697710316439414, + "grad_norm": 2.018446207046509, + "learning_rate": 2.4742268041237115e-06, + "loss": 2.9874, + "step": 12 + }, + { + "epoch": 0.026755852842809364, + "grad_norm": 1.8558588027954102, + "learning_rate": 2.680412371134021e-06, + "loss": 2.8608, + "step": 13 + }, + { + "epoch": 0.028813995369179314, + "grad_norm": 1.9658265113830566, + "learning_rate": 2.8865979381443297e-06, + "loss": 2.8596, + "step": 14 + }, + { + "epoch": 0.030872137895549268, + "grad_norm": 1.872044563293457, + "learning_rate": 3.0927835051546395e-06, + "loss": 2.8836, + "step": 15 + }, + { + "epoch": 0.03293028042191922, + "grad_norm": 1.8884096145629883, + "learning_rate": 3.298969072164949e-06, + "loss": 2.9383, + "step": 16 + }, + { + "epoch": 0.03498842294828917, + "grad_norm": 1.8795744180679321, + "learning_rate": 3.5051546391752577e-06, + "loss": 2.883, + "step": 17 + }, + { + "epoch": 0.03704656547465912, + "grad_norm": 1.783678412437439, + "learning_rate": 3.7113402061855674e-06, + "loss": 2.8019, + "step": 18 + }, + { + "epoch": 0.039104708001029075, + "grad_norm": 1.820617914199829, + "learning_rate": 3.917525773195877e-06, + "loss": 2.8813, + "step": 19 + }, + { + "epoch": 0.04116285052739902, + "grad_norm": 1.8188731670379639, + "learning_rate": 4.123711340206186e-06, + "loss": 2.8401, + "step": 20 + }, + { + "epoch": 0.043220993053768975, + "grad_norm": 1.7305251359939575, + "learning_rate": 4.329896907216495e-06, + "loss": 2.7478, + "step": 21 + }, + { + "epoch": 0.04527913558013893, + "grad_norm": 1.7014551162719727, + "learning_rate": 4.536082474226804e-06, + "loss": 2.7356, + "step": 22 + }, + { + "epoch": 0.047337278106508875, + "grad_norm": 1.677381157875061, + "learning_rate": 4.742268041237113e-06, + "loss": 2.7593, + "step": 23 + }, + { + "epoch": 0.04939542063287883, + "grad_norm": 1.628554344177246, + "learning_rate": 4.948453608247423e-06, + "loss": 2.7689, + "step": 24 + }, + { + "epoch": 0.051453563159248775, + "grad_norm": 1.4968128204345703, + "learning_rate": 5.154639175257732e-06, + "loss": 2.6613, + "step": 25 + }, + { + "epoch": 0.05351170568561873, + "grad_norm": 1.4734832048416138, + "learning_rate": 5.360824742268042e-06, + "loss": 2.7095, + "step": 26 + }, + { + "epoch": 0.05556984821198868, + "grad_norm": 1.3745571374893188, + "learning_rate": 5.567010309278351e-06, + "loss": 2.655, + "step": 27 + }, + { + "epoch": 0.05762799073835863, + "grad_norm": 1.3381729125976562, + "learning_rate": 5.7731958762886594e-06, + "loss": 2.55, + "step": 28 + }, + { + "epoch": 0.05968613326472858, + "grad_norm": 1.3388073444366455, + "learning_rate": 5.979381443298969e-06, + "loss": 2.5219, + "step": 29 + }, + { + "epoch": 0.061744275791098535, + "grad_norm": 1.317008376121521, + "learning_rate": 6.185567010309279e-06, + "loss": 2.4491, + "step": 30 + }, + { + "epoch": 0.06380241831746848, + "grad_norm": 1.3210794925689697, + "learning_rate": 6.391752577319588e-06, + "loss": 2.4358, + "step": 31 + }, + { + "epoch": 0.06586056084383844, + "grad_norm": 1.182519555091858, + "learning_rate": 6.597938144329898e-06, + "loss": 2.4514, + "step": 32 + }, + { + "epoch": 0.06791870337020839, + "grad_norm": 1.2238099575042725, + "learning_rate": 6.804123711340207e-06, + "loss": 2.442, + "step": 33 + }, + { + "epoch": 0.06997684589657834, + "grad_norm": 1.1793314218521118, + "learning_rate": 7.010309278350515e-06, + "loss": 2.3864, + "step": 34 + }, + { + "epoch": 0.0720349884229483, + "grad_norm": 1.1983020305633545, + "learning_rate": 7.216494845360825e-06, + "loss": 2.3796, + "step": 35 + }, + { + "epoch": 0.07409313094931824, + "grad_norm": 1.2189652919769287, + "learning_rate": 7.422680412371135e-06, + "loss": 2.4152, + "step": 36 + }, + { + "epoch": 0.07615127347568819, + "grad_norm": 1.14923095703125, + "learning_rate": 7.628865979381444e-06, + "loss": 2.3298, + "step": 37 + }, + { + "epoch": 0.07820941600205815, + "grad_norm": 1.147013545036316, + "learning_rate": 7.835051546391754e-06, + "loss": 2.2488, + "step": 38 + }, + { + "epoch": 0.0802675585284281, + "grad_norm": 1.133981466293335, + "learning_rate": 8.041237113402063e-06, + "loss": 2.1825, + "step": 39 + }, + { + "epoch": 0.08232570105479804, + "grad_norm": 1.1686867475509644, + "learning_rate": 8.247422680412371e-06, + "loss": 2.2282, + "step": 40 + }, + { + "epoch": 0.084383843581168, + "grad_norm": 1.131690502166748, + "learning_rate": 8.453608247422681e-06, + "loss": 2.0962, + "step": 41 + }, + { + "epoch": 0.08644198610753795, + "grad_norm": 1.1626195907592773, + "learning_rate": 8.65979381443299e-06, + "loss": 2.1161, + "step": 42 + }, + { + "epoch": 0.0885001286339079, + "grad_norm": 1.1508581638336182, + "learning_rate": 8.865979381443299e-06, + "loss": 1.9856, + "step": 43 + }, + { + "epoch": 0.09055827116027786, + "grad_norm": 1.2286733388900757, + "learning_rate": 9.072164948453609e-06, + "loss": 2.076, + "step": 44 + }, + { + "epoch": 0.0926164136866478, + "grad_norm": 1.82068932056427, + "learning_rate": 9.278350515463918e-06, + "loss": 1.9995, + "step": 45 + }, + { + "epoch": 0.09467455621301775, + "grad_norm": 2.079101324081421, + "learning_rate": 9.484536082474226e-06, + "loss": 1.9601, + "step": 46 + }, + { + "epoch": 0.0967326987393877, + "grad_norm": 1.1209226846694946, + "learning_rate": 9.690721649484536e-06, + "loss": 1.9346, + "step": 47 + }, + { + "epoch": 0.09879084126575766, + "grad_norm": 1.0579711198806763, + "learning_rate": 9.896907216494846e-06, + "loss": 1.8764, + "step": 48 + }, + { + "epoch": 0.1008489837921276, + "grad_norm": 1.0434011220932007, + "learning_rate": 1.0103092783505156e-05, + "loss": 1.8483, + "step": 49 + }, + { + "epoch": 0.10290712631849755, + "grad_norm": 1.0089991092681885, + "learning_rate": 1.0309278350515464e-05, + "loss": 1.8018, + "step": 50 + }, + { + "epoch": 0.10496526884486751, + "grad_norm": 1.0117324590682983, + "learning_rate": 1.0515463917525775e-05, + "loss": 1.8003, + "step": 51 + }, + { + "epoch": 0.10702341137123746, + "grad_norm": 1.0006697177886963, + "learning_rate": 1.0721649484536083e-05, + "loss": 1.7482, + "step": 52 + }, + { + "epoch": 0.1090815538976074, + "grad_norm": 2.1164329051971436, + "learning_rate": 1.0927835051546391e-05, + "loss": 1.7363, + "step": 53 + }, + { + "epoch": 0.11113969642397736, + "grad_norm": 0.9573502540588379, + "learning_rate": 1.1134020618556703e-05, + "loss": 1.661, + "step": 54 + }, + { + "epoch": 0.11319783895034731, + "grad_norm": 1.0059764385223389, + "learning_rate": 1.134020618556701e-05, + "loss": 1.6979, + "step": 55 + }, + { + "epoch": 0.11525598147671726, + "grad_norm": 0.9719656109809875, + "learning_rate": 1.1546391752577319e-05, + "loss": 1.6318, + "step": 56 + }, + { + "epoch": 0.11731412400308722, + "grad_norm": 1.0024539232254028, + "learning_rate": 1.175257731958763e-05, + "loss": 1.6283, + "step": 57 + }, + { + "epoch": 0.11937226652945716, + "grad_norm": 0.9772456288337708, + "learning_rate": 1.1958762886597938e-05, + "loss": 1.5611, + "step": 58 + }, + { + "epoch": 0.12143040905582711, + "grad_norm": 0.9947625994682312, + "learning_rate": 1.2164948453608248e-05, + "loss": 1.6073, + "step": 59 + }, + { + "epoch": 0.12348855158219707, + "grad_norm": 2.112889051437378, + "learning_rate": 1.2371134020618558e-05, + "loss": 1.6208, + "step": 60 + }, + { + "epoch": 0.12554669410856703, + "grad_norm": 1.0515345335006714, + "learning_rate": 1.2577319587628866e-05, + "loss": 1.569, + "step": 61 + }, + { + "epoch": 0.12760483663493696, + "grad_norm": 1.0782145261764526, + "learning_rate": 1.2783505154639176e-05, + "loss": 1.5097, + "step": 62 + }, + { + "epoch": 0.12966297916130692, + "grad_norm": 1.154104232788086, + "learning_rate": 1.2989690721649485e-05, + "loss": 1.5472, + "step": 63 + }, + { + "epoch": 0.13172112168767688, + "grad_norm": 1.1614656448364258, + "learning_rate": 1.3195876288659795e-05, + "loss": 1.4833, + "step": 64 + }, + { + "epoch": 0.13377926421404682, + "grad_norm": 1.1720911264419556, + "learning_rate": 1.3402061855670103e-05, + "loss": 1.4644, + "step": 65 + }, + { + "epoch": 0.13583740674041678, + "grad_norm": 1.8903896808624268, + "learning_rate": 1.3608247422680415e-05, + "loss": 1.4286, + "step": 66 + }, + { + "epoch": 0.13789554926678674, + "grad_norm": 1.2675013542175293, + "learning_rate": 1.3814432989690723e-05, + "loss": 1.416, + "step": 67 + }, + { + "epoch": 0.13995369179315667, + "grad_norm": 1.266434907913208, + "learning_rate": 1.402061855670103e-05, + "loss": 1.3171, + "step": 68 + }, + { + "epoch": 0.14201183431952663, + "grad_norm": 1.3408889770507812, + "learning_rate": 1.4226804123711342e-05, + "loss": 1.3396, + "step": 69 + }, + { + "epoch": 0.1440699768458966, + "grad_norm": 1.3862446546554565, + "learning_rate": 1.443298969072165e-05, + "loss": 1.2642, + "step": 70 + }, + { + "epoch": 0.14612811937226652, + "grad_norm": 2.110553026199341, + "learning_rate": 1.4639175257731958e-05, + "loss": 1.2593, + "step": 71 + }, + { + "epoch": 0.14818626189863648, + "grad_norm": 1.7017499208450317, + "learning_rate": 1.484536082474227e-05, + "loss": 1.24, + "step": 72 + }, + { + "epoch": 0.15024440442500644, + "grad_norm": 1.9851700067520142, + "learning_rate": 1.5051546391752578e-05, + "loss": 1.2313, + "step": 73 + }, + { + "epoch": 0.15230254695137638, + "grad_norm": 2.009608030319214, + "learning_rate": 1.5257731958762888e-05, + "loss": 1.1281, + "step": 74 + }, + { + "epoch": 0.15436068947774634, + "grad_norm": 2.7587485313415527, + "learning_rate": 1.5463917525773197e-05, + "loss": 1.1248, + "step": 75 + }, + { + "epoch": 0.1564188320041163, + "grad_norm": 2.780954599380493, + "learning_rate": 1.5670103092783507e-05, + "loss": 1.0797, + "step": 76 + }, + { + "epoch": 0.15847697453048623, + "grad_norm": 3.1470866203308105, + "learning_rate": 1.5876288659793813e-05, + "loss": 1.0064, + "step": 77 + }, + { + "epoch": 0.1605351170568562, + "grad_norm": 4.653595447540283, + "learning_rate": 1.6082474226804127e-05, + "loss": 0.9219, + "step": 78 + }, + { + "epoch": 0.16259325958322615, + "grad_norm": 4.157363414764404, + "learning_rate": 1.6288659793814433e-05, + "loss": 0.8709, + "step": 79 + }, + { + "epoch": 0.16465140210959608, + "grad_norm": 4.5814924240112305, + "learning_rate": 1.6494845360824743e-05, + "loss": 0.7693, + "step": 80 + }, + { + "epoch": 0.16670954463596604, + "grad_norm": 5.096139907836914, + "learning_rate": 1.6701030927835052e-05, + "loss": 0.6868, + "step": 81 + }, + { + "epoch": 0.168767687162336, + "grad_norm": 4.858880519866943, + "learning_rate": 1.6907216494845362e-05, + "loss": 0.5971, + "step": 82 + }, + { + "epoch": 0.17082582968870594, + "grad_norm": 4.42564582824707, + "learning_rate": 1.7113402061855672e-05, + "loss": 0.4719, + "step": 83 + }, + { + "epoch": 0.1728839722150759, + "grad_norm": 7.720851421356201, + "learning_rate": 1.731958762886598e-05, + "loss": 0.3943, + "step": 84 + }, + { + "epoch": 0.17494211474144586, + "grad_norm": 0.41923192143440247, + "learning_rate": 1.752577319587629e-05, + "loss": 0.3635, + "step": 85 + }, + { + "epoch": 0.1770002572678158, + "grad_norm": 0.2771846354007721, + "learning_rate": 1.7731958762886598e-05, + "loss": 0.3597, + "step": 86 + }, + { + "epoch": 0.17905839979418575, + "grad_norm": 0.24761857092380524, + "learning_rate": 1.793814432989691e-05, + "loss": 0.3735, + "step": 87 + }, + { + "epoch": 0.1811165423205557, + "grad_norm": 0.23277048766613007, + "learning_rate": 1.8144329896907217e-05, + "loss": 0.3643, + "step": 88 + }, + { + "epoch": 0.18317468484692565, + "grad_norm": 0.22931228578090668, + "learning_rate": 1.8350515463917527e-05, + "loss": 0.3519, + "step": 89 + }, + { + "epoch": 0.1852328273732956, + "grad_norm": 0.20750615000724792, + "learning_rate": 1.8556701030927837e-05, + "loss": 0.3431, + "step": 90 + }, + { + "epoch": 0.18729096989966554, + "grad_norm": 0.2080322951078415, + "learning_rate": 1.8762886597938147e-05, + "loss": 0.3632, + "step": 91 + }, + { + "epoch": 0.1893491124260355, + "grad_norm": 0.20186181366443634, + "learning_rate": 1.8969072164948453e-05, + "loss": 0.3492, + "step": 92 + }, + { + "epoch": 0.19140725495240546, + "grad_norm": 0.19172786176204681, + "learning_rate": 1.9175257731958766e-05, + "loss": 0.3552, + "step": 93 + }, + { + "epoch": 0.1934653974787754, + "grad_norm": 0.1747850626707077, + "learning_rate": 1.9381443298969072e-05, + "loss": 0.3355, + "step": 94 + }, + { + "epoch": 0.19552354000514535, + "grad_norm": 0.196411594748497, + "learning_rate": 1.9587628865979382e-05, + "loss": 0.3271, + "step": 95 + }, + { + "epoch": 0.1975816825315153, + "grad_norm": 0.20063228905200958, + "learning_rate": 1.9793814432989692e-05, + "loss": 0.3351, + "step": 96 + }, + { + "epoch": 0.19963982505788525, + "grad_norm": 0.19240939617156982, + "learning_rate": 2e-05, + "loss": 0.3266, + "step": 97 + }, + { + "epoch": 0.2016979675842552, + "grad_norm": 0.18206572532653809, + "learning_rate": 1.997709049255441e-05, + "loss": 0.3393, + "step": 98 + }, + { + "epoch": 0.20375611011062517, + "grad_norm": 0.20384562015533447, + "learning_rate": 1.9954180985108823e-05, + "loss": 0.3395, + "step": 99 + }, + { + "epoch": 0.2058142526369951, + "grad_norm": 0.19944581389427185, + "learning_rate": 1.9931271477663232e-05, + "loss": 0.3268, + "step": 100 + }, + { + "epoch": 0.2058142526369951, + "eval_loss": 0.3456890285015106, + "eval_runtime": 2114.0178, + "eval_samples_per_second": 3.677, + "eval_steps_per_second": 0.92, + "step": 100 + }, + { + "epoch": 0.20787239516336506, + "grad_norm": 0.17743557691574097, + "learning_rate": 1.990836197021764e-05, + "loss": 0.3439, + "step": 101 + }, + { + "epoch": 0.20993053768973502, + "grad_norm": 0.18746449053287506, + "learning_rate": 1.9885452462772053e-05, + "loss": 0.326, + "step": 102 + }, + { + "epoch": 0.21198868021610495, + "grad_norm": 0.18555815517902374, + "learning_rate": 1.9862542955326462e-05, + "loss": 0.3337, + "step": 103 + }, + { + "epoch": 0.2140468227424749, + "grad_norm": 0.16591575741767883, + "learning_rate": 1.9839633447880874e-05, + "loss": 0.3121, + "step": 104 + }, + { + "epoch": 0.21610496526884487, + "grad_norm": 0.1621987372636795, + "learning_rate": 1.9816723940435283e-05, + "loss": 0.3287, + "step": 105 + }, + { + "epoch": 0.2181631077952148, + "grad_norm": 0.1614532470703125, + "learning_rate": 1.9793814432989692e-05, + "loss": 0.3306, + "step": 106 + }, + { + "epoch": 0.22022125032158477, + "grad_norm": 0.17993387579917908, + "learning_rate": 1.9770904925544104e-05, + "loss": 0.3341, + "step": 107 + }, + { + "epoch": 0.22227939284795473, + "grad_norm": 0.1550011783838272, + "learning_rate": 1.9747995418098513e-05, + "loss": 0.3197, + "step": 108 + }, + { + "epoch": 0.22433753537432466, + "grad_norm": 0.18471524119377136, + "learning_rate": 1.9725085910652922e-05, + "loss": 0.3285, + "step": 109 + }, + { + "epoch": 0.22639567790069462, + "grad_norm": 0.15604373812675476, + "learning_rate": 1.9702176403207334e-05, + "loss": 0.3298, + "step": 110 + }, + { + "epoch": 0.22845382042706458, + "grad_norm": 0.1682298630475998, + "learning_rate": 1.9679266895761743e-05, + "loss": 0.3343, + "step": 111 + }, + { + "epoch": 0.2305119629534345, + "grad_norm": 0.14933635294437408, + "learning_rate": 1.9656357388316152e-05, + "loss": 0.3134, + "step": 112 + }, + { + "epoch": 0.23257010547980447, + "grad_norm": 0.14892347157001495, + "learning_rate": 1.963344788087056e-05, + "loss": 0.3154, + "step": 113 + }, + { + "epoch": 0.23462824800617443, + "grad_norm": 0.1577889323234558, + "learning_rate": 1.9610538373424973e-05, + "loss": 0.3122, + "step": 114 + }, + { + "epoch": 0.23668639053254437, + "grad_norm": 0.16482344269752502, + "learning_rate": 1.9587628865979382e-05, + "loss": 0.3193, + "step": 115 + }, + { + "epoch": 0.23874453305891433, + "grad_norm": 0.15328913927078247, + "learning_rate": 1.956471935853379e-05, + "loss": 0.3217, + "step": 116 + }, + { + "epoch": 0.2408026755852843, + "grad_norm": 0.16140656173229218, + "learning_rate": 1.9541809851088203e-05, + "loss": 0.318, + "step": 117 + }, + { + "epoch": 0.24286081811165422, + "grad_norm": 0.15448373556137085, + "learning_rate": 1.9518900343642612e-05, + "loss": 0.3205, + "step": 118 + }, + { + "epoch": 0.24491896063802418, + "grad_norm": 0.14716887474060059, + "learning_rate": 1.9495990836197025e-05, + "loss": 0.3164, + "step": 119 + }, + { + "epoch": 0.24697710316439414, + "grad_norm": 0.16582027077674866, + "learning_rate": 1.9473081328751433e-05, + "loss": 0.3191, + "step": 120 + }, + { + "epoch": 0.24903524569076407, + "grad_norm": 0.15213699638843536, + "learning_rate": 1.9450171821305842e-05, + "loss": 0.304, + "step": 121 + }, + { + "epoch": 0.25109338821713406, + "grad_norm": 0.1659238487482071, + "learning_rate": 1.9427262313860255e-05, + "loss": 0.3184, + "step": 122 + }, + { + "epoch": 0.253151530743504, + "grad_norm": 0.15596656501293182, + "learning_rate": 1.9404352806414663e-05, + "loss": 0.3092, + "step": 123 + }, + { + "epoch": 0.2552096732698739, + "grad_norm": 0.15868476033210754, + "learning_rate": 1.9381443298969072e-05, + "loss": 0.3163, + "step": 124 + }, + { + "epoch": 0.2572678157962439, + "grad_norm": 0.15386095643043518, + "learning_rate": 1.9358533791523485e-05, + "loss": 0.3049, + "step": 125 + }, + { + "epoch": 0.25932595832261385, + "grad_norm": 0.15179213881492615, + "learning_rate": 1.9335624284077894e-05, + "loss": 0.3131, + "step": 126 + }, + { + "epoch": 0.2613841008489838, + "grad_norm": 0.1595134735107422, + "learning_rate": 1.9312714776632306e-05, + "loss": 0.3069, + "step": 127 + }, + { + "epoch": 0.26344224337535377, + "grad_norm": 0.16989803314208984, + "learning_rate": 1.9289805269186715e-05, + "loss": 0.3052, + "step": 128 + }, + { + "epoch": 0.2655003859017237, + "grad_norm": 0.14803892374038696, + "learning_rate": 1.9266895761741124e-05, + "loss": 0.3065, + "step": 129 + }, + { + "epoch": 0.26755852842809363, + "grad_norm": 0.16676583886146545, + "learning_rate": 1.9243986254295536e-05, + "loss": 0.2962, + "step": 130 + }, + { + "epoch": 0.2696166709544636, + "grad_norm": 0.15694552659988403, + "learning_rate": 1.9221076746849945e-05, + "loss": 0.3096, + "step": 131 + }, + { + "epoch": 0.27167481348083355, + "grad_norm": 0.17696696519851685, + "learning_rate": 1.9198167239404354e-05, + "loss": 0.3145, + "step": 132 + }, + { + "epoch": 0.2737329560072035, + "grad_norm": 0.17204038798809052, + "learning_rate": 1.9175257731958766e-05, + "loss": 0.3248, + "step": 133 + }, + { + "epoch": 0.2757910985335735, + "grad_norm": 0.15630359947681427, + "learning_rate": 1.9152348224513175e-05, + "loss": 0.3117, + "step": 134 + }, + { + "epoch": 0.2778492410599434, + "grad_norm": 0.15757997334003448, + "learning_rate": 1.9129438717067584e-05, + "loss": 0.3145, + "step": 135 + }, + { + "epoch": 0.27990738358631334, + "grad_norm": 0.16273653507232666, + "learning_rate": 1.9106529209621996e-05, + "loss": 0.3159, + "step": 136 + }, + { + "epoch": 0.28196552611268333, + "grad_norm": 0.16213104128837585, + "learning_rate": 1.9083619702176405e-05, + "loss": 0.2949, + "step": 137 + }, + { + "epoch": 0.28402366863905326, + "grad_norm": 0.15377865731716156, + "learning_rate": 1.9060710194730814e-05, + "loss": 0.306, + "step": 138 + }, + { + "epoch": 0.2860818111654232, + "grad_norm": 0.1545962244272232, + "learning_rate": 1.9037800687285223e-05, + "loss": 0.2966, + "step": 139 + }, + { + "epoch": 0.2881399536917932, + "grad_norm": 0.15516617894172668, + "learning_rate": 1.9014891179839635e-05, + "loss": 0.3122, + "step": 140 + }, + { + "epoch": 0.2901980962181631, + "grad_norm": 0.14734458923339844, + "learning_rate": 1.8991981672394044e-05, + "loss": 0.3118, + "step": 141 + }, + { + "epoch": 0.29225623874453305, + "grad_norm": 0.1644304096698761, + "learning_rate": 1.8969072164948453e-05, + "loss": 0.3027, + "step": 142 + }, + { + "epoch": 0.29431438127090304, + "grad_norm": 0.14632569253444672, + "learning_rate": 1.8946162657502865e-05, + "loss": 0.3023, + "step": 143 + }, + { + "epoch": 0.29637252379727297, + "grad_norm": 0.1573137789964676, + "learning_rate": 1.8923253150057274e-05, + "loss": 0.3102, + "step": 144 + }, + { + "epoch": 0.2984306663236429, + "grad_norm": 0.16423144936561584, + "learning_rate": 1.8900343642611686e-05, + "loss": 0.3033, + "step": 145 + }, + { + "epoch": 0.3004888088500129, + "grad_norm": 0.15420907735824585, + "learning_rate": 1.8877434135166095e-05, + "loss": 0.3089, + "step": 146 + }, + { + "epoch": 0.3025469513763828, + "grad_norm": 0.1579178273677826, + "learning_rate": 1.8854524627720504e-05, + "loss": 0.3071, + "step": 147 + }, + { + "epoch": 0.30460509390275275, + "grad_norm": 0.15866397321224213, + "learning_rate": 1.8831615120274916e-05, + "loss": 0.3083, + "step": 148 + }, + { + "epoch": 0.30666323642912274, + "grad_norm": 0.16651487350463867, + "learning_rate": 1.8808705612829325e-05, + "loss": 0.3099, + "step": 149 + }, + { + "epoch": 0.3087213789554927, + "grad_norm": 0.16281908750534058, + "learning_rate": 1.8785796105383734e-05, + "loss": 0.3034, + "step": 150 + }, + { + "epoch": 0.3107795214818626, + "grad_norm": 0.17449837923049927, + "learning_rate": 1.8762886597938147e-05, + "loss": 0.3054, + "step": 151 + }, + { + "epoch": 0.3128376640082326, + "grad_norm": 0.15403546392917633, + "learning_rate": 1.8739977090492555e-05, + "loss": 0.297, + "step": 152 + }, + { + "epoch": 0.31489580653460253, + "grad_norm": 0.1472466140985489, + "learning_rate": 1.8717067583046968e-05, + "loss": 0.2973, + "step": 153 + }, + { + "epoch": 0.31695394906097246, + "grad_norm": 0.16027937829494476, + "learning_rate": 1.8694158075601377e-05, + "loss": 0.3054, + "step": 154 + }, + { + "epoch": 0.31901209158734245, + "grad_norm": 0.17086225748062134, + "learning_rate": 1.8671248568155786e-05, + "loss": 0.307, + "step": 155 + }, + { + "epoch": 0.3210702341137124, + "grad_norm": 0.15930697321891785, + "learning_rate": 1.8648339060710198e-05, + "loss": 0.293, + "step": 156 + }, + { + "epoch": 0.3231283766400823, + "grad_norm": 0.17086376249790192, + "learning_rate": 1.8625429553264607e-05, + "loss": 0.293, + "step": 157 + }, + { + "epoch": 0.3251865191664523, + "grad_norm": 0.15970875322818756, + "learning_rate": 1.8602520045819016e-05, + "loss": 0.3083, + "step": 158 + }, + { + "epoch": 0.32724466169282224, + "grad_norm": 0.16355909407138824, + "learning_rate": 1.8579610538373428e-05, + "loss": 0.3139, + "step": 159 + }, + { + "epoch": 0.32930280421919217, + "grad_norm": 0.15183711051940918, + "learning_rate": 1.8556701030927837e-05, + "loss": 0.2953, + "step": 160 + }, + { + "epoch": 0.33136094674556216, + "grad_norm": 0.15123715996742249, + "learning_rate": 1.853379152348225e-05, + "loss": 0.3025, + "step": 161 + }, + { + "epoch": 0.3334190892719321, + "grad_norm": 0.1576143503189087, + "learning_rate": 1.8510882016036658e-05, + "loss": 0.2904, + "step": 162 + }, + { + "epoch": 0.335477231798302, + "grad_norm": 0.1457504779100418, + "learning_rate": 1.8487972508591067e-05, + "loss": 0.2909, + "step": 163 + }, + { + "epoch": 0.337535374324672, + "grad_norm": 0.1557442992925644, + "learning_rate": 1.846506300114548e-05, + "loss": 0.3027, + "step": 164 + }, + { + "epoch": 0.33959351685104194, + "grad_norm": 0.15662318468093872, + "learning_rate": 1.8442153493699888e-05, + "loss": 0.311, + "step": 165 + }, + { + "epoch": 0.3416516593774119, + "grad_norm": 0.16177058219909668, + "learning_rate": 1.8419243986254297e-05, + "loss": 0.2944, + "step": 166 + }, + { + "epoch": 0.34370980190378186, + "grad_norm": 0.16406729817390442, + "learning_rate": 1.8396334478808706e-05, + "loss": 0.2927, + "step": 167 + }, + { + "epoch": 0.3457679444301518, + "grad_norm": 0.16642791032791138, + "learning_rate": 1.8373424971363115e-05, + "loss": 0.3063, + "step": 168 + }, + { + "epoch": 0.34782608695652173, + "grad_norm": 0.1650693714618683, + "learning_rate": 1.8350515463917527e-05, + "loss": 0.2957, + "step": 169 + }, + { + "epoch": 0.3498842294828917, + "grad_norm": 0.15349675714969635, + "learning_rate": 1.8327605956471936e-05, + "loss": 0.297, + "step": 170 + }, + { + "epoch": 0.35194237200926165, + "grad_norm": 0.17770209908485413, + "learning_rate": 1.8304696449026348e-05, + "loss": 0.3011, + "step": 171 + }, + { + "epoch": 0.3540005145356316, + "grad_norm": 0.1647631675004959, + "learning_rate": 1.8281786941580757e-05, + "loss": 0.2962, + "step": 172 + }, + { + "epoch": 0.35605865706200157, + "grad_norm": 0.1603834480047226, + "learning_rate": 1.8258877434135166e-05, + "loss": 0.2937, + "step": 173 + }, + { + "epoch": 0.3581167995883715, + "grad_norm": 0.16780880093574524, + "learning_rate": 1.8235967926689578e-05, + "loss": 0.2997, + "step": 174 + }, + { + "epoch": 0.36017494211474144, + "grad_norm": 0.15976767241954803, + "learning_rate": 1.8213058419243987e-05, + "loss": 0.3043, + "step": 175 + }, + { + "epoch": 0.3622330846411114, + "grad_norm": 0.16236485540866852, + "learning_rate": 1.8190148911798396e-05, + "loss": 0.3069, + "step": 176 + }, + { + "epoch": 0.36429122716748136, + "grad_norm": 0.16391968727111816, + "learning_rate": 1.816723940435281e-05, + "loss": 0.2923, + "step": 177 + }, + { + "epoch": 0.3663493696938513, + "grad_norm": 0.15806889533996582, + "learning_rate": 1.8144329896907217e-05, + "loss": 0.2872, + "step": 178 + }, + { + "epoch": 0.3684075122202212, + "grad_norm": 0.1627352088689804, + "learning_rate": 1.812142038946163e-05, + "loss": 0.3032, + "step": 179 + }, + { + "epoch": 0.3704656547465912, + "grad_norm": 0.15103371441364288, + "learning_rate": 1.809851088201604e-05, + "loss": 0.2847, + "step": 180 + }, + { + "epoch": 0.37252379727296114, + "grad_norm": 0.15178488194942474, + "learning_rate": 1.8075601374570447e-05, + "loss": 0.3017, + "step": 181 + }, + { + "epoch": 0.3745819397993311, + "grad_norm": 0.15493899583816528, + "learning_rate": 1.805269186712486e-05, + "loss": 0.2901, + "step": 182 + }, + { + "epoch": 0.37664008232570106, + "grad_norm": 0.15990686416625977, + "learning_rate": 1.802978235967927e-05, + "loss": 0.2861, + "step": 183 + }, + { + "epoch": 0.378698224852071, + "grad_norm": 0.15824148058891296, + "learning_rate": 1.8006872852233677e-05, + "loss": 0.2885, + "step": 184 + }, + { + "epoch": 0.38075636737844093, + "grad_norm": 0.15690775215625763, + "learning_rate": 1.798396334478809e-05, + "loss": 0.2814, + "step": 185 + }, + { + "epoch": 0.3828145099048109, + "grad_norm": 0.15833796560764313, + "learning_rate": 1.79610538373425e-05, + "loss": 0.2847, + "step": 186 + }, + { + "epoch": 0.38487265243118085, + "grad_norm": 0.16560044884681702, + "learning_rate": 1.793814432989691e-05, + "loss": 0.3061, + "step": 187 + }, + { + "epoch": 0.3869307949575508, + "grad_norm": 0.16240179538726807, + "learning_rate": 1.791523482245132e-05, + "loss": 0.2943, + "step": 188 + }, + { + "epoch": 0.38898893748392077, + "grad_norm": 0.15825721621513367, + "learning_rate": 1.789232531500573e-05, + "loss": 0.2934, + "step": 189 + }, + { + "epoch": 0.3910470800102907, + "grad_norm": 0.16665388643741608, + "learning_rate": 1.786941580756014e-05, + "loss": 0.291, + "step": 190 + }, + { + "epoch": 0.39310522253666064, + "grad_norm": 0.16581200063228607, + "learning_rate": 1.784650630011455e-05, + "loss": 0.2849, + "step": 191 + }, + { + "epoch": 0.3951633650630306, + "grad_norm": 0.1604345291852951, + "learning_rate": 1.782359679266896e-05, + "loss": 0.3, + "step": 192 + }, + { + "epoch": 0.39722150758940056, + "grad_norm": 0.16107915341854095, + "learning_rate": 1.7800687285223368e-05, + "loss": 0.2847, + "step": 193 + }, + { + "epoch": 0.3992796501157705, + "grad_norm": 0.1571730375289917, + "learning_rate": 1.7777777777777777e-05, + "loss": 0.2863, + "step": 194 + }, + { + "epoch": 0.4013377926421405, + "grad_norm": 0.1656399518251419, + "learning_rate": 1.775486827033219e-05, + "loss": 0.2878, + "step": 195 + }, + { + "epoch": 0.4033959351685104, + "grad_norm": 0.16738460958003998, + "learning_rate": 1.7731958762886598e-05, + "loss": 0.286, + "step": 196 + }, + { + "epoch": 0.40545407769488034, + "grad_norm": 0.16704292595386505, + "learning_rate": 1.770904925544101e-05, + "loss": 0.2919, + "step": 197 + }, + { + "epoch": 0.40751222022125033, + "grad_norm": 0.16215579211711884, + "learning_rate": 1.768613974799542e-05, + "loss": 0.2874, + "step": 198 + }, + { + "epoch": 0.40957036274762026, + "grad_norm": 0.15573479235172272, + "learning_rate": 1.7663230240549828e-05, + "loss": 0.2904, + "step": 199 + }, + { + "epoch": 0.4116285052739902, + "grad_norm": 0.1707623153924942, + "learning_rate": 1.764032073310424e-05, + "loss": 0.289, + "step": 200 + }, + { + "epoch": 0.4116285052739902, + "eval_loss": 0.3214050829410553, + "eval_runtime": 2449.7742, + "eval_samples_per_second": 3.173, + "eval_steps_per_second": 0.794, + "step": 200 + } + ], + "logging_steps": 1, + "max_steps": 970, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 6.615055995939963e+17, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-200/training_args.bin b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..5999c7ee9dd10ee9076d748e4757533e635fa832 --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ee55a11f5a306eb7c39b536fdfe2459bc279e468da50f6adda478c4deffcb812 +size 5688 diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-300/README.md b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-300/README.md new file mode 100644 index 0000000000000000000000000000000000000000..fb0d77d70fdc5c829c8889cb85828736b7eb9714 --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-300/README.md @@ -0,0 +1,202 @@ +--- +base_model: unsloth/codegemma-7b-it +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.15.0 \ No newline at end of file diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-300/adapter_config.json b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-300/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e841602c6a59fc7b085ac647af4d4c312445d261 --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-300/adapter_config.json @@ -0,0 +1,39 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "unsloth/codegemma-7b-it", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "o_proj", + "up_proj", + "q_proj", + "down_proj", + "gate_proj", + "k_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-300/adapter_model.safetensors b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-300/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..29792b39363712712a910b5d1e44e8831ab392b1 --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-300/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4904f256bc0d01bcdb1e0e13e3ffde750efe2e272e6acbcc661481ab3b7afccd +size 800116456 diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-300/optimizer.pt b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-300/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..d13d222503fbcaa99a78c30e5fbbe9c35807f38c --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-300/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c65cc37f7b35b81a9c19fbda436234ae27b3e7ffe674a3420cf814c7f38a722b +size 406743860 diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-300/rng_state.pth b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-300/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..c106dea4cfea43d2dcc6a94703e83cee33c84ddb --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-300/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b36f846e18f654c3cfee105cea70c9e74df15bfcb4fc79f4b82ed01c58b83ae3 +size 14244 diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-300/scheduler.pt b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-300/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..4220dc2d992333018ab3f5ac1f3319d021dd9ba6 --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-300/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c92157e090687c4e3d6cd57a32a0d42e6f559fd56b8e5771861e40841b6ea7c3 +size 1064 diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-300/special_tokens_map.json b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-300/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..f6119589e367b2de0fc8cbd2f1217667532e3174 --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-300/special_tokens_map.json @@ -0,0 +1,30 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-300/tokenizer.json b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-300/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..45a5e23f54141c5f4f97a8d58f3ffadc28e287ba --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-300/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d964a2c8346d40f95791533eae48730d5f163c2e65fd16333560fd3e661df318 +size 34362915 diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-300/tokenizer.model b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-300/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..71a98ce40269d847e58957e1e070d9ae8eb184af --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-300/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:583f2ebd2a1936009b7da991ea255504db68c7a9713a78673d1335a87098966c +size 4241023 diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-300/tokenizer_config.json b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-300/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9b9b1b4acdd4afcedae39d1cf6f0bc7ef7d9910f --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-300/tokenizer_config.json @@ -0,0 +1,2011 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "<|fim_prefix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "<|fim_middle|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "<|fim_suffix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "<|file_separator|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "139": { + "content": "▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "140": { + "content": "▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "141": { + "content": "▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "142": { + "content": "▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "143": { + "content": "▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "144": { + "content": "▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "145": { + "content": "▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "146": { + "content": "▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "147": { + "content": "▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "148": { + "content": "▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "149": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "150": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "152": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "153": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "154": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "155": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "156": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "157": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "158": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "159": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "160": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "161": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "162": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "163": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "164": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "165": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "166": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "167": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "168": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255968": { + "content": "[toxicity=0]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255969": { + "content": "\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255970": { + "content": "\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255971": { + "content": "\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255972": { + "content": "\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255973": { + "content": "\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255974": { + "content": "\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255975": { + "content": "\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255976": { + "content": "\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255977": { + "content": "\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255978": { + "content": "\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255979": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255980": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255981": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255982": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255983": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255984": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255985": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255986": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255987": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255988": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255989": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255990": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255991": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255992": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255993": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255994": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255995": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255996": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255997": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255998": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255999": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "bos_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "extra_special_tokens": {}, + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "padding_side": "left", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-300/trainer_state.json b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-300/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..db31f434a9ab74a564e84ee22d93cd1a1aefbe15 --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-300/trainer_state.json @@ -0,0 +1,2158 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.6174427579109854, + "eval_steps": 100, + "global_step": 300, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0020581425263699513, + "grad_norm": 11.994463920593262, + "learning_rate": 2.061855670103093e-07, + "loss": 2.91, + "step": 1 + }, + { + "epoch": 0.004116285052739903, + "grad_norm": 11.769092559814453, + "learning_rate": 4.123711340206186e-07, + "loss": 2.8686, + "step": 2 + }, + { + "epoch": 0.0061744275791098535, + "grad_norm": 13.05551815032959, + "learning_rate": 6.185567010309279e-07, + "loss": 3.0286, + "step": 3 + }, + { + "epoch": 0.008232570105479805, + "grad_norm": 12.334521293640137, + "learning_rate": 8.247422680412372e-07, + "loss": 2.904, + "step": 4 + }, + { + "epoch": 0.010290712631849755, + "grad_norm": 12.075353622436523, + "learning_rate": 1.0309278350515464e-06, + "loss": 2.8991, + "step": 5 + }, + { + "epoch": 0.012348855158219707, + "grad_norm": 11.86032485961914, + "learning_rate": 1.2371134020618557e-06, + "loss": 3.0007, + "step": 6 + }, + { + "epoch": 0.014406997684589657, + "grad_norm": 10.10457992553711, + "learning_rate": 1.4432989690721649e-06, + "loss": 2.8493, + "step": 7 + }, + { + "epoch": 0.01646514021095961, + "grad_norm": 8.56408405303955, + "learning_rate": 1.6494845360824744e-06, + "loss": 2.9573, + "step": 8 + }, + { + "epoch": 0.01852328273732956, + "grad_norm": 6.307392120361328, + "learning_rate": 1.8556701030927837e-06, + "loss": 2.9507, + "step": 9 + }, + { + "epoch": 0.02058142526369951, + "grad_norm": 4.276430130004883, + "learning_rate": 2.061855670103093e-06, + "loss": 2.8988, + "step": 10 + }, + { + "epoch": 0.022639567790069464, + "grad_norm": 2.5912015438079834, + "learning_rate": 2.268041237113402e-06, + "loss": 2.9926, + "step": 11 + }, + { + "epoch": 0.024697710316439414, + "grad_norm": 2.018446207046509, + "learning_rate": 2.4742268041237115e-06, + "loss": 2.9874, + "step": 12 + }, + { + "epoch": 0.026755852842809364, + "grad_norm": 1.8558588027954102, + "learning_rate": 2.680412371134021e-06, + "loss": 2.8608, + "step": 13 + }, + { + "epoch": 0.028813995369179314, + "grad_norm": 1.9658265113830566, + "learning_rate": 2.8865979381443297e-06, + "loss": 2.8596, + "step": 14 + }, + { + "epoch": 0.030872137895549268, + "grad_norm": 1.872044563293457, + "learning_rate": 3.0927835051546395e-06, + "loss": 2.8836, + "step": 15 + }, + { + "epoch": 0.03293028042191922, + "grad_norm": 1.8884096145629883, + "learning_rate": 3.298969072164949e-06, + "loss": 2.9383, + "step": 16 + }, + { + "epoch": 0.03498842294828917, + "grad_norm": 1.8795744180679321, + "learning_rate": 3.5051546391752577e-06, + "loss": 2.883, + "step": 17 + }, + { + "epoch": 0.03704656547465912, + "grad_norm": 1.783678412437439, + "learning_rate": 3.7113402061855674e-06, + "loss": 2.8019, + "step": 18 + }, + { + "epoch": 0.039104708001029075, + "grad_norm": 1.820617914199829, + "learning_rate": 3.917525773195877e-06, + "loss": 2.8813, + "step": 19 + }, + { + "epoch": 0.04116285052739902, + "grad_norm": 1.8188731670379639, + "learning_rate": 4.123711340206186e-06, + "loss": 2.8401, + "step": 20 + }, + { + "epoch": 0.043220993053768975, + "grad_norm": 1.7305251359939575, + "learning_rate": 4.329896907216495e-06, + "loss": 2.7478, + "step": 21 + }, + { + "epoch": 0.04527913558013893, + "grad_norm": 1.7014551162719727, + "learning_rate": 4.536082474226804e-06, + "loss": 2.7356, + "step": 22 + }, + { + "epoch": 0.047337278106508875, + "grad_norm": 1.677381157875061, + "learning_rate": 4.742268041237113e-06, + "loss": 2.7593, + "step": 23 + }, + { + "epoch": 0.04939542063287883, + "grad_norm": 1.628554344177246, + "learning_rate": 4.948453608247423e-06, + "loss": 2.7689, + "step": 24 + }, + { + "epoch": 0.051453563159248775, + "grad_norm": 1.4968128204345703, + "learning_rate": 5.154639175257732e-06, + "loss": 2.6613, + "step": 25 + }, + { + "epoch": 0.05351170568561873, + "grad_norm": 1.4734832048416138, + "learning_rate": 5.360824742268042e-06, + "loss": 2.7095, + "step": 26 + }, + { + "epoch": 0.05556984821198868, + "grad_norm": 1.3745571374893188, + "learning_rate": 5.567010309278351e-06, + "loss": 2.655, + "step": 27 + }, + { + "epoch": 0.05762799073835863, + "grad_norm": 1.3381729125976562, + "learning_rate": 5.7731958762886594e-06, + "loss": 2.55, + "step": 28 + }, + { + "epoch": 0.05968613326472858, + "grad_norm": 1.3388073444366455, + "learning_rate": 5.979381443298969e-06, + "loss": 2.5219, + "step": 29 + }, + { + "epoch": 0.061744275791098535, + "grad_norm": 1.317008376121521, + "learning_rate": 6.185567010309279e-06, + "loss": 2.4491, + "step": 30 + }, + { + "epoch": 0.06380241831746848, + "grad_norm": 1.3210794925689697, + "learning_rate": 6.391752577319588e-06, + "loss": 2.4358, + "step": 31 + }, + { + "epoch": 0.06586056084383844, + "grad_norm": 1.182519555091858, + "learning_rate": 6.597938144329898e-06, + "loss": 2.4514, + "step": 32 + }, + { + "epoch": 0.06791870337020839, + "grad_norm": 1.2238099575042725, + "learning_rate": 6.804123711340207e-06, + "loss": 2.442, + "step": 33 + }, + { + "epoch": 0.06997684589657834, + "grad_norm": 1.1793314218521118, + "learning_rate": 7.010309278350515e-06, + "loss": 2.3864, + "step": 34 + }, + { + "epoch": 0.0720349884229483, + "grad_norm": 1.1983020305633545, + "learning_rate": 7.216494845360825e-06, + "loss": 2.3796, + "step": 35 + }, + { + "epoch": 0.07409313094931824, + "grad_norm": 1.2189652919769287, + "learning_rate": 7.422680412371135e-06, + "loss": 2.4152, + "step": 36 + }, + { + "epoch": 0.07615127347568819, + "grad_norm": 1.14923095703125, + "learning_rate": 7.628865979381444e-06, + "loss": 2.3298, + "step": 37 + }, + { + "epoch": 0.07820941600205815, + "grad_norm": 1.147013545036316, + "learning_rate": 7.835051546391754e-06, + "loss": 2.2488, + "step": 38 + }, + { + "epoch": 0.0802675585284281, + "grad_norm": 1.133981466293335, + "learning_rate": 8.041237113402063e-06, + "loss": 2.1825, + "step": 39 + }, + { + "epoch": 0.08232570105479804, + "grad_norm": 1.1686867475509644, + "learning_rate": 8.247422680412371e-06, + "loss": 2.2282, + "step": 40 + }, + { + "epoch": 0.084383843581168, + "grad_norm": 1.131690502166748, + "learning_rate": 8.453608247422681e-06, + "loss": 2.0962, + "step": 41 + }, + { + "epoch": 0.08644198610753795, + "grad_norm": 1.1626195907592773, + "learning_rate": 8.65979381443299e-06, + "loss": 2.1161, + "step": 42 + }, + { + "epoch": 0.0885001286339079, + "grad_norm": 1.1508581638336182, + "learning_rate": 8.865979381443299e-06, + "loss": 1.9856, + "step": 43 + }, + { + "epoch": 0.09055827116027786, + "grad_norm": 1.2286733388900757, + "learning_rate": 9.072164948453609e-06, + "loss": 2.076, + "step": 44 + }, + { + "epoch": 0.0926164136866478, + "grad_norm": 1.82068932056427, + "learning_rate": 9.278350515463918e-06, + "loss": 1.9995, + "step": 45 + }, + { + "epoch": 0.09467455621301775, + "grad_norm": 2.079101324081421, + "learning_rate": 9.484536082474226e-06, + "loss": 1.9601, + "step": 46 + }, + { + "epoch": 0.0967326987393877, + "grad_norm": 1.1209226846694946, + "learning_rate": 9.690721649484536e-06, + "loss": 1.9346, + "step": 47 + }, + { + "epoch": 0.09879084126575766, + "grad_norm": 1.0579711198806763, + "learning_rate": 9.896907216494846e-06, + "loss": 1.8764, + "step": 48 + }, + { + "epoch": 0.1008489837921276, + "grad_norm": 1.0434011220932007, + "learning_rate": 1.0103092783505156e-05, + "loss": 1.8483, + "step": 49 + }, + { + "epoch": 0.10290712631849755, + "grad_norm": 1.0089991092681885, + "learning_rate": 1.0309278350515464e-05, + "loss": 1.8018, + "step": 50 + }, + { + "epoch": 0.10496526884486751, + "grad_norm": 1.0117324590682983, + "learning_rate": 1.0515463917525775e-05, + "loss": 1.8003, + "step": 51 + }, + { + "epoch": 0.10702341137123746, + "grad_norm": 1.0006697177886963, + "learning_rate": 1.0721649484536083e-05, + "loss": 1.7482, + "step": 52 + }, + { + "epoch": 0.1090815538976074, + "grad_norm": 2.1164329051971436, + "learning_rate": 1.0927835051546391e-05, + "loss": 1.7363, + "step": 53 + }, + { + "epoch": 0.11113969642397736, + "grad_norm": 0.9573502540588379, + "learning_rate": 1.1134020618556703e-05, + "loss": 1.661, + "step": 54 + }, + { + "epoch": 0.11319783895034731, + "grad_norm": 1.0059764385223389, + "learning_rate": 1.134020618556701e-05, + "loss": 1.6979, + "step": 55 + }, + { + "epoch": 0.11525598147671726, + "grad_norm": 0.9719656109809875, + "learning_rate": 1.1546391752577319e-05, + "loss": 1.6318, + "step": 56 + }, + { + "epoch": 0.11731412400308722, + "grad_norm": 1.0024539232254028, + "learning_rate": 1.175257731958763e-05, + "loss": 1.6283, + "step": 57 + }, + { + "epoch": 0.11937226652945716, + "grad_norm": 0.9772456288337708, + "learning_rate": 1.1958762886597938e-05, + "loss": 1.5611, + "step": 58 + }, + { + "epoch": 0.12143040905582711, + "grad_norm": 0.9947625994682312, + "learning_rate": 1.2164948453608248e-05, + "loss": 1.6073, + "step": 59 + }, + { + "epoch": 0.12348855158219707, + "grad_norm": 2.112889051437378, + "learning_rate": 1.2371134020618558e-05, + "loss": 1.6208, + "step": 60 + }, + { + "epoch": 0.12554669410856703, + "grad_norm": 1.0515345335006714, + "learning_rate": 1.2577319587628866e-05, + "loss": 1.569, + "step": 61 + }, + { + "epoch": 0.12760483663493696, + "grad_norm": 1.0782145261764526, + "learning_rate": 1.2783505154639176e-05, + "loss": 1.5097, + "step": 62 + }, + { + "epoch": 0.12966297916130692, + "grad_norm": 1.154104232788086, + "learning_rate": 1.2989690721649485e-05, + "loss": 1.5472, + "step": 63 + }, + { + "epoch": 0.13172112168767688, + "grad_norm": 1.1614656448364258, + "learning_rate": 1.3195876288659795e-05, + "loss": 1.4833, + "step": 64 + }, + { + "epoch": 0.13377926421404682, + "grad_norm": 1.1720911264419556, + "learning_rate": 1.3402061855670103e-05, + "loss": 1.4644, + "step": 65 + }, + { + "epoch": 0.13583740674041678, + "grad_norm": 1.8903896808624268, + "learning_rate": 1.3608247422680415e-05, + "loss": 1.4286, + "step": 66 + }, + { + "epoch": 0.13789554926678674, + "grad_norm": 1.2675013542175293, + "learning_rate": 1.3814432989690723e-05, + "loss": 1.416, + "step": 67 + }, + { + "epoch": 0.13995369179315667, + "grad_norm": 1.266434907913208, + "learning_rate": 1.402061855670103e-05, + "loss": 1.3171, + "step": 68 + }, + { + "epoch": 0.14201183431952663, + "grad_norm": 1.3408889770507812, + "learning_rate": 1.4226804123711342e-05, + "loss": 1.3396, + "step": 69 + }, + { + "epoch": 0.1440699768458966, + "grad_norm": 1.3862446546554565, + "learning_rate": 1.443298969072165e-05, + "loss": 1.2642, + "step": 70 + }, + { + "epoch": 0.14612811937226652, + "grad_norm": 2.110553026199341, + "learning_rate": 1.4639175257731958e-05, + "loss": 1.2593, + "step": 71 + }, + { + "epoch": 0.14818626189863648, + "grad_norm": 1.7017499208450317, + "learning_rate": 1.484536082474227e-05, + "loss": 1.24, + "step": 72 + }, + { + "epoch": 0.15024440442500644, + "grad_norm": 1.9851700067520142, + "learning_rate": 1.5051546391752578e-05, + "loss": 1.2313, + "step": 73 + }, + { + "epoch": 0.15230254695137638, + "grad_norm": 2.009608030319214, + "learning_rate": 1.5257731958762888e-05, + "loss": 1.1281, + "step": 74 + }, + { + "epoch": 0.15436068947774634, + "grad_norm": 2.7587485313415527, + "learning_rate": 1.5463917525773197e-05, + "loss": 1.1248, + "step": 75 + }, + { + "epoch": 0.1564188320041163, + "grad_norm": 2.780954599380493, + "learning_rate": 1.5670103092783507e-05, + "loss": 1.0797, + "step": 76 + }, + { + "epoch": 0.15847697453048623, + "grad_norm": 3.1470866203308105, + "learning_rate": 1.5876288659793813e-05, + "loss": 1.0064, + "step": 77 + }, + { + "epoch": 0.1605351170568562, + "grad_norm": 4.653595447540283, + "learning_rate": 1.6082474226804127e-05, + "loss": 0.9219, + "step": 78 + }, + { + "epoch": 0.16259325958322615, + "grad_norm": 4.157363414764404, + "learning_rate": 1.6288659793814433e-05, + "loss": 0.8709, + "step": 79 + }, + { + "epoch": 0.16465140210959608, + "grad_norm": 4.5814924240112305, + "learning_rate": 1.6494845360824743e-05, + "loss": 0.7693, + "step": 80 + }, + { + "epoch": 0.16670954463596604, + "grad_norm": 5.096139907836914, + "learning_rate": 1.6701030927835052e-05, + "loss": 0.6868, + "step": 81 + }, + { + "epoch": 0.168767687162336, + "grad_norm": 4.858880519866943, + "learning_rate": 1.6907216494845362e-05, + "loss": 0.5971, + "step": 82 + }, + { + "epoch": 0.17082582968870594, + "grad_norm": 4.42564582824707, + "learning_rate": 1.7113402061855672e-05, + "loss": 0.4719, + "step": 83 + }, + { + "epoch": 0.1728839722150759, + "grad_norm": 7.720851421356201, + "learning_rate": 1.731958762886598e-05, + "loss": 0.3943, + "step": 84 + }, + { + "epoch": 0.17494211474144586, + "grad_norm": 0.41923192143440247, + "learning_rate": 1.752577319587629e-05, + "loss": 0.3635, + "step": 85 + }, + { + "epoch": 0.1770002572678158, + "grad_norm": 0.2771846354007721, + "learning_rate": 1.7731958762886598e-05, + "loss": 0.3597, + "step": 86 + }, + { + "epoch": 0.17905839979418575, + "grad_norm": 0.24761857092380524, + "learning_rate": 1.793814432989691e-05, + "loss": 0.3735, + "step": 87 + }, + { + "epoch": 0.1811165423205557, + "grad_norm": 0.23277048766613007, + "learning_rate": 1.8144329896907217e-05, + "loss": 0.3643, + "step": 88 + }, + { + "epoch": 0.18317468484692565, + "grad_norm": 0.22931228578090668, + "learning_rate": 1.8350515463917527e-05, + "loss": 0.3519, + "step": 89 + }, + { + "epoch": 0.1852328273732956, + "grad_norm": 0.20750615000724792, + "learning_rate": 1.8556701030927837e-05, + "loss": 0.3431, + "step": 90 + }, + { + "epoch": 0.18729096989966554, + "grad_norm": 0.2080322951078415, + "learning_rate": 1.8762886597938147e-05, + "loss": 0.3632, + "step": 91 + }, + { + "epoch": 0.1893491124260355, + "grad_norm": 0.20186181366443634, + "learning_rate": 1.8969072164948453e-05, + "loss": 0.3492, + "step": 92 + }, + { + "epoch": 0.19140725495240546, + "grad_norm": 0.19172786176204681, + "learning_rate": 1.9175257731958766e-05, + "loss": 0.3552, + "step": 93 + }, + { + "epoch": 0.1934653974787754, + "grad_norm": 0.1747850626707077, + "learning_rate": 1.9381443298969072e-05, + "loss": 0.3355, + "step": 94 + }, + { + "epoch": 0.19552354000514535, + "grad_norm": 0.196411594748497, + "learning_rate": 1.9587628865979382e-05, + "loss": 0.3271, + "step": 95 + }, + { + "epoch": 0.1975816825315153, + "grad_norm": 0.20063228905200958, + "learning_rate": 1.9793814432989692e-05, + "loss": 0.3351, + "step": 96 + }, + { + "epoch": 0.19963982505788525, + "grad_norm": 0.19240939617156982, + "learning_rate": 2e-05, + "loss": 0.3266, + "step": 97 + }, + { + "epoch": 0.2016979675842552, + "grad_norm": 0.18206572532653809, + "learning_rate": 1.997709049255441e-05, + "loss": 0.3393, + "step": 98 + }, + { + "epoch": 0.20375611011062517, + "grad_norm": 0.20384562015533447, + "learning_rate": 1.9954180985108823e-05, + "loss": 0.3395, + "step": 99 + }, + { + "epoch": 0.2058142526369951, + "grad_norm": 0.19944581389427185, + "learning_rate": 1.9931271477663232e-05, + "loss": 0.3268, + "step": 100 + }, + { + "epoch": 0.2058142526369951, + "eval_loss": 0.3456890285015106, + "eval_runtime": 2114.0178, + "eval_samples_per_second": 3.677, + "eval_steps_per_second": 0.92, + "step": 100 + }, + { + "epoch": 0.20787239516336506, + "grad_norm": 0.17743557691574097, + "learning_rate": 1.990836197021764e-05, + "loss": 0.3439, + "step": 101 + }, + { + "epoch": 0.20993053768973502, + "grad_norm": 0.18746449053287506, + "learning_rate": 1.9885452462772053e-05, + "loss": 0.326, + "step": 102 + }, + { + "epoch": 0.21198868021610495, + "grad_norm": 0.18555815517902374, + "learning_rate": 1.9862542955326462e-05, + "loss": 0.3337, + "step": 103 + }, + { + "epoch": 0.2140468227424749, + "grad_norm": 0.16591575741767883, + "learning_rate": 1.9839633447880874e-05, + "loss": 0.3121, + "step": 104 + }, + { + "epoch": 0.21610496526884487, + "grad_norm": 0.1621987372636795, + "learning_rate": 1.9816723940435283e-05, + "loss": 0.3287, + "step": 105 + }, + { + "epoch": 0.2181631077952148, + "grad_norm": 0.1614532470703125, + "learning_rate": 1.9793814432989692e-05, + "loss": 0.3306, + "step": 106 + }, + { + "epoch": 0.22022125032158477, + "grad_norm": 0.17993387579917908, + "learning_rate": 1.9770904925544104e-05, + "loss": 0.3341, + "step": 107 + }, + { + "epoch": 0.22227939284795473, + "grad_norm": 0.1550011783838272, + "learning_rate": 1.9747995418098513e-05, + "loss": 0.3197, + "step": 108 + }, + { + "epoch": 0.22433753537432466, + "grad_norm": 0.18471524119377136, + "learning_rate": 1.9725085910652922e-05, + "loss": 0.3285, + "step": 109 + }, + { + "epoch": 0.22639567790069462, + "grad_norm": 0.15604373812675476, + "learning_rate": 1.9702176403207334e-05, + "loss": 0.3298, + "step": 110 + }, + { + "epoch": 0.22845382042706458, + "grad_norm": 0.1682298630475998, + "learning_rate": 1.9679266895761743e-05, + "loss": 0.3343, + "step": 111 + }, + { + "epoch": 0.2305119629534345, + "grad_norm": 0.14933635294437408, + "learning_rate": 1.9656357388316152e-05, + "loss": 0.3134, + "step": 112 + }, + { + "epoch": 0.23257010547980447, + "grad_norm": 0.14892347157001495, + "learning_rate": 1.963344788087056e-05, + "loss": 0.3154, + "step": 113 + }, + { + "epoch": 0.23462824800617443, + "grad_norm": 0.1577889323234558, + "learning_rate": 1.9610538373424973e-05, + "loss": 0.3122, + "step": 114 + }, + { + "epoch": 0.23668639053254437, + "grad_norm": 0.16482344269752502, + "learning_rate": 1.9587628865979382e-05, + "loss": 0.3193, + "step": 115 + }, + { + "epoch": 0.23874453305891433, + "grad_norm": 0.15328913927078247, + "learning_rate": 1.956471935853379e-05, + "loss": 0.3217, + "step": 116 + }, + { + "epoch": 0.2408026755852843, + "grad_norm": 0.16140656173229218, + "learning_rate": 1.9541809851088203e-05, + "loss": 0.318, + "step": 117 + }, + { + "epoch": 0.24286081811165422, + "grad_norm": 0.15448373556137085, + "learning_rate": 1.9518900343642612e-05, + "loss": 0.3205, + "step": 118 + }, + { + "epoch": 0.24491896063802418, + "grad_norm": 0.14716887474060059, + "learning_rate": 1.9495990836197025e-05, + "loss": 0.3164, + "step": 119 + }, + { + "epoch": 0.24697710316439414, + "grad_norm": 0.16582027077674866, + "learning_rate": 1.9473081328751433e-05, + "loss": 0.3191, + "step": 120 + }, + { + "epoch": 0.24903524569076407, + "grad_norm": 0.15213699638843536, + "learning_rate": 1.9450171821305842e-05, + "loss": 0.304, + "step": 121 + }, + { + "epoch": 0.25109338821713406, + "grad_norm": 0.1659238487482071, + "learning_rate": 1.9427262313860255e-05, + "loss": 0.3184, + "step": 122 + }, + { + "epoch": 0.253151530743504, + "grad_norm": 0.15596656501293182, + "learning_rate": 1.9404352806414663e-05, + "loss": 0.3092, + "step": 123 + }, + { + "epoch": 0.2552096732698739, + "grad_norm": 0.15868476033210754, + "learning_rate": 1.9381443298969072e-05, + "loss": 0.3163, + "step": 124 + }, + { + "epoch": 0.2572678157962439, + "grad_norm": 0.15386095643043518, + "learning_rate": 1.9358533791523485e-05, + "loss": 0.3049, + "step": 125 + }, + { + "epoch": 0.25932595832261385, + "grad_norm": 0.15179213881492615, + "learning_rate": 1.9335624284077894e-05, + "loss": 0.3131, + "step": 126 + }, + { + "epoch": 0.2613841008489838, + "grad_norm": 0.1595134735107422, + "learning_rate": 1.9312714776632306e-05, + "loss": 0.3069, + "step": 127 + }, + { + "epoch": 0.26344224337535377, + "grad_norm": 0.16989803314208984, + "learning_rate": 1.9289805269186715e-05, + "loss": 0.3052, + "step": 128 + }, + { + "epoch": 0.2655003859017237, + "grad_norm": 0.14803892374038696, + "learning_rate": 1.9266895761741124e-05, + "loss": 0.3065, + "step": 129 + }, + { + "epoch": 0.26755852842809363, + "grad_norm": 0.16676583886146545, + "learning_rate": 1.9243986254295536e-05, + "loss": 0.2962, + "step": 130 + }, + { + "epoch": 0.2696166709544636, + "grad_norm": 0.15694552659988403, + "learning_rate": 1.9221076746849945e-05, + "loss": 0.3096, + "step": 131 + }, + { + "epoch": 0.27167481348083355, + "grad_norm": 0.17696696519851685, + "learning_rate": 1.9198167239404354e-05, + "loss": 0.3145, + "step": 132 + }, + { + "epoch": 0.2737329560072035, + "grad_norm": 0.17204038798809052, + "learning_rate": 1.9175257731958766e-05, + "loss": 0.3248, + "step": 133 + }, + { + "epoch": 0.2757910985335735, + "grad_norm": 0.15630359947681427, + "learning_rate": 1.9152348224513175e-05, + "loss": 0.3117, + "step": 134 + }, + { + "epoch": 0.2778492410599434, + "grad_norm": 0.15757997334003448, + "learning_rate": 1.9129438717067584e-05, + "loss": 0.3145, + "step": 135 + }, + { + "epoch": 0.27990738358631334, + "grad_norm": 0.16273653507232666, + "learning_rate": 1.9106529209621996e-05, + "loss": 0.3159, + "step": 136 + }, + { + "epoch": 0.28196552611268333, + "grad_norm": 0.16213104128837585, + "learning_rate": 1.9083619702176405e-05, + "loss": 0.2949, + "step": 137 + }, + { + "epoch": 0.28402366863905326, + "grad_norm": 0.15377865731716156, + "learning_rate": 1.9060710194730814e-05, + "loss": 0.306, + "step": 138 + }, + { + "epoch": 0.2860818111654232, + "grad_norm": 0.1545962244272232, + "learning_rate": 1.9037800687285223e-05, + "loss": 0.2966, + "step": 139 + }, + { + "epoch": 0.2881399536917932, + "grad_norm": 0.15516617894172668, + "learning_rate": 1.9014891179839635e-05, + "loss": 0.3122, + "step": 140 + }, + { + "epoch": 0.2901980962181631, + "grad_norm": 0.14734458923339844, + "learning_rate": 1.8991981672394044e-05, + "loss": 0.3118, + "step": 141 + }, + { + "epoch": 0.29225623874453305, + "grad_norm": 0.1644304096698761, + "learning_rate": 1.8969072164948453e-05, + "loss": 0.3027, + "step": 142 + }, + { + "epoch": 0.29431438127090304, + "grad_norm": 0.14632569253444672, + "learning_rate": 1.8946162657502865e-05, + "loss": 0.3023, + "step": 143 + }, + { + "epoch": 0.29637252379727297, + "grad_norm": 0.1573137789964676, + "learning_rate": 1.8923253150057274e-05, + "loss": 0.3102, + "step": 144 + }, + { + "epoch": 0.2984306663236429, + "grad_norm": 0.16423144936561584, + "learning_rate": 1.8900343642611686e-05, + "loss": 0.3033, + "step": 145 + }, + { + "epoch": 0.3004888088500129, + "grad_norm": 0.15420907735824585, + "learning_rate": 1.8877434135166095e-05, + "loss": 0.3089, + "step": 146 + }, + { + "epoch": 0.3025469513763828, + "grad_norm": 0.1579178273677826, + "learning_rate": 1.8854524627720504e-05, + "loss": 0.3071, + "step": 147 + }, + { + "epoch": 0.30460509390275275, + "grad_norm": 0.15866397321224213, + "learning_rate": 1.8831615120274916e-05, + "loss": 0.3083, + "step": 148 + }, + { + "epoch": 0.30666323642912274, + "grad_norm": 0.16651487350463867, + "learning_rate": 1.8808705612829325e-05, + "loss": 0.3099, + "step": 149 + }, + { + "epoch": 0.3087213789554927, + "grad_norm": 0.16281908750534058, + "learning_rate": 1.8785796105383734e-05, + "loss": 0.3034, + "step": 150 + }, + { + "epoch": 0.3107795214818626, + "grad_norm": 0.17449837923049927, + "learning_rate": 1.8762886597938147e-05, + "loss": 0.3054, + "step": 151 + }, + { + "epoch": 0.3128376640082326, + "grad_norm": 0.15403546392917633, + "learning_rate": 1.8739977090492555e-05, + "loss": 0.297, + "step": 152 + }, + { + "epoch": 0.31489580653460253, + "grad_norm": 0.1472466140985489, + "learning_rate": 1.8717067583046968e-05, + "loss": 0.2973, + "step": 153 + }, + { + "epoch": 0.31695394906097246, + "grad_norm": 0.16027937829494476, + "learning_rate": 1.8694158075601377e-05, + "loss": 0.3054, + "step": 154 + }, + { + "epoch": 0.31901209158734245, + "grad_norm": 0.17086225748062134, + "learning_rate": 1.8671248568155786e-05, + "loss": 0.307, + "step": 155 + }, + { + "epoch": 0.3210702341137124, + "grad_norm": 0.15930697321891785, + "learning_rate": 1.8648339060710198e-05, + "loss": 0.293, + "step": 156 + }, + { + "epoch": 0.3231283766400823, + "grad_norm": 0.17086376249790192, + "learning_rate": 1.8625429553264607e-05, + "loss": 0.293, + "step": 157 + }, + { + "epoch": 0.3251865191664523, + "grad_norm": 0.15970875322818756, + "learning_rate": 1.8602520045819016e-05, + "loss": 0.3083, + "step": 158 + }, + { + "epoch": 0.32724466169282224, + "grad_norm": 0.16355909407138824, + "learning_rate": 1.8579610538373428e-05, + "loss": 0.3139, + "step": 159 + }, + { + "epoch": 0.32930280421919217, + "grad_norm": 0.15183711051940918, + "learning_rate": 1.8556701030927837e-05, + "loss": 0.2953, + "step": 160 + }, + { + "epoch": 0.33136094674556216, + "grad_norm": 0.15123715996742249, + "learning_rate": 1.853379152348225e-05, + "loss": 0.3025, + "step": 161 + }, + { + "epoch": 0.3334190892719321, + "grad_norm": 0.1576143503189087, + "learning_rate": 1.8510882016036658e-05, + "loss": 0.2904, + "step": 162 + }, + { + "epoch": 0.335477231798302, + "grad_norm": 0.1457504779100418, + "learning_rate": 1.8487972508591067e-05, + "loss": 0.2909, + "step": 163 + }, + { + "epoch": 0.337535374324672, + "grad_norm": 0.1557442992925644, + "learning_rate": 1.846506300114548e-05, + "loss": 0.3027, + "step": 164 + }, + { + "epoch": 0.33959351685104194, + "grad_norm": 0.15662318468093872, + "learning_rate": 1.8442153493699888e-05, + "loss": 0.311, + "step": 165 + }, + { + "epoch": 0.3416516593774119, + "grad_norm": 0.16177058219909668, + "learning_rate": 1.8419243986254297e-05, + "loss": 0.2944, + "step": 166 + }, + { + "epoch": 0.34370980190378186, + "grad_norm": 0.16406729817390442, + "learning_rate": 1.8396334478808706e-05, + "loss": 0.2927, + "step": 167 + }, + { + "epoch": 0.3457679444301518, + "grad_norm": 0.16642791032791138, + "learning_rate": 1.8373424971363115e-05, + "loss": 0.3063, + "step": 168 + }, + { + "epoch": 0.34782608695652173, + "grad_norm": 0.1650693714618683, + "learning_rate": 1.8350515463917527e-05, + "loss": 0.2957, + "step": 169 + }, + { + "epoch": 0.3498842294828917, + "grad_norm": 0.15349675714969635, + "learning_rate": 1.8327605956471936e-05, + "loss": 0.297, + "step": 170 + }, + { + "epoch": 0.35194237200926165, + "grad_norm": 0.17770209908485413, + "learning_rate": 1.8304696449026348e-05, + "loss": 0.3011, + "step": 171 + }, + { + "epoch": 0.3540005145356316, + "grad_norm": 0.1647631675004959, + "learning_rate": 1.8281786941580757e-05, + "loss": 0.2962, + "step": 172 + }, + { + "epoch": 0.35605865706200157, + "grad_norm": 0.1603834480047226, + "learning_rate": 1.8258877434135166e-05, + "loss": 0.2937, + "step": 173 + }, + { + "epoch": 0.3581167995883715, + "grad_norm": 0.16780880093574524, + "learning_rate": 1.8235967926689578e-05, + "loss": 0.2997, + "step": 174 + }, + { + "epoch": 0.36017494211474144, + "grad_norm": 0.15976767241954803, + "learning_rate": 1.8213058419243987e-05, + "loss": 0.3043, + "step": 175 + }, + { + "epoch": 0.3622330846411114, + "grad_norm": 0.16236485540866852, + "learning_rate": 1.8190148911798396e-05, + "loss": 0.3069, + "step": 176 + }, + { + "epoch": 0.36429122716748136, + "grad_norm": 0.16391968727111816, + "learning_rate": 1.816723940435281e-05, + "loss": 0.2923, + "step": 177 + }, + { + "epoch": 0.3663493696938513, + "grad_norm": 0.15806889533996582, + "learning_rate": 1.8144329896907217e-05, + "loss": 0.2872, + "step": 178 + }, + { + "epoch": 0.3684075122202212, + "grad_norm": 0.1627352088689804, + "learning_rate": 1.812142038946163e-05, + "loss": 0.3032, + "step": 179 + }, + { + "epoch": 0.3704656547465912, + "grad_norm": 0.15103371441364288, + "learning_rate": 1.809851088201604e-05, + "loss": 0.2847, + "step": 180 + }, + { + "epoch": 0.37252379727296114, + "grad_norm": 0.15178488194942474, + "learning_rate": 1.8075601374570447e-05, + "loss": 0.3017, + "step": 181 + }, + { + "epoch": 0.3745819397993311, + "grad_norm": 0.15493899583816528, + "learning_rate": 1.805269186712486e-05, + "loss": 0.2901, + "step": 182 + }, + { + "epoch": 0.37664008232570106, + "grad_norm": 0.15990686416625977, + "learning_rate": 1.802978235967927e-05, + "loss": 0.2861, + "step": 183 + }, + { + "epoch": 0.378698224852071, + "grad_norm": 0.15824148058891296, + "learning_rate": 1.8006872852233677e-05, + "loss": 0.2885, + "step": 184 + }, + { + "epoch": 0.38075636737844093, + "grad_norm": 0.15690775215625763, + "learning_rate": 1.798396334478809e-05, + "loss": 0.2814, + "step": 185 + }, + { + "epoch": 0.3828145099048109, + "grad_norm": 0.15833796560764313, + "learning_rate": 1.79610538373425e-05, + "loss": 0.2847, + "step": 186 + }, + { + "epoch": 0.38487265243118085, + "grad_norm": 0.16560044884681702, + "learning_rate": 1.793814432989691e-05, + "loss": 0.3061, + "step": 187 + }, + { + "epoch": 0.3869307949575508, + "grad_norm": 0.16240179538726807, + "learning_rate": 1.791523482245132e-05, + "loss": 0.2943, + "step": 188 + }, + { + "epoch": 0.38898893748392077, + "grad_norm": 0.15825721621513367, + "learning_rate": 1.789232531500573e-05, + "loss": 0.2934, + "step": 189 + }, + { + "epoch": 0.3910470800102907, + "grad_norm": 0.16665388643741608, + "learning_rate": 1.786941580756014e-05, + "loss": 0.291, + "step": 190 + }, + { + "epoch": 0.39310522253666064, + "grad_norm": 0.16581200063228607, + "learning_rate": 1.784650630011455e-05, + "loss": 0.2849, + "step": 191 + }, + { + "epoch": 0.3951633650630306, + "grad_norm": 0.1604345291852951, + "learning_rate": 1.782359679266896e-05, + "loss": 0.3, + "step": 192 + }, + { + "epoch": 0.39722150758940056, + "grad_norm": 0.16107915341854095, + "learning_rate": 1.7800687285223368e-05, + "loss": 0.2847, + "step": 193 + }, + { + "epoch": 0.3992796501157705, + "grad_norm": 0.1571730375289917, + "learning_rate": 1.7777777777777777e-05, + "loss": 0.2863, + "step": 194 + }, + { + "epoch": 0.4013377926421405, + "grad_norm": 0.1656399518251419, + "learning_rate": 1.775486827033219e-05, + "loss": 0.2878, + "step": 195 + }, + { + "epoch": 0.4033959351685104, + "grad_norm": 0.16738460958003998, + "learning_rate": 1.7731958762886598e-05, + "loss": 0.286, + "step": 196 + }, + { + "epoch": 0.40545407769488034, + "grad_norm": 0.16704292595386505, + "learning_rate": 1.770904925544101e-05, + "loss": 0.2919, + "step": 197 + }, + { + "epoch": 0.40751222022125033, + "grad_norm": 0.16215579211711884, + "learning_rate": 1.768613974799542e-05, + "loss": 0.2874, + "step": 198 + }, + { + "epoch": 0.40957036274762026, + "grad_norm": 0.15573479235172272, + "learning_rate": 1.7663230240549828e-05, + "loss": 0.2904, + "step": 199 + }, + { + "epoch": 0.4116285052739902, + "grad_norm": 0.1707623153924942, + "learning_rate": 1.764032073310424e-05, + "loss": 0.289, + "step": 200 + }, + { + "epoch": 0.4116285052739902, + "eval_loss": 0.3214050829410553, + "eval_runtime": 2449.7742, + "eval_samples_per_second": 3.173, + "eval_steps_per_second": 0.794, + "step": 200 + }, + { + "epoch": 0.4136866478003602, + "grad_norm": 0.1699172556400299, + "learning_rate": 1.761741122565865e-05, + "loss": 0.2852, + "step": 201 + }, + { + "epoch": 0.4157447903267301, + "grad_norm": 0.19150058925151825, + "learning_rate": 1.7594501718213058e-05, + "loss": 0.29, + "step": 202 + }, + { + "epoch": 0.41780293285310005, + "grad_norm": 0.15794627368450165, + "learning_rate": 1.757159221076747e-05, + "loss": 0.2746, + "step": 203 + }, + { + "epoch": 0.41986107537947004, + "grad_norm": 0.17305190861225128, + "learning_rate": 1.754868270332188e-05, + "loss": 0.3003, + "step": 204 + }, + { + "epoch": 0.42191921790583997, + "grad_norm": 0.16257523000240326, + "learning_rate": 1.752577319587629e-05, + "loss": 0.2789, + "step": 205 + }, + { + "epoch": 0.4239773604322099, + "grad_norm": 0.17273619771003723, + "learning_rate": 1.75028636884307e-05, + "loss": 0.2917, + "step": 206 + }, + { + "epoch": 0.4260355029585799, + "grad_norm": 0.17502790689468384, + "learning_rate": 1.747995418098511e-05, + "loss": 0.2992, + "step": 207 + }, + { + "epoch": 0.4280936454849498, + "grad_norm": 0.16464050114154816, + "learning_rate": 1.745704467353952e-05, + "loss": 0.2873, + "step": 208 + }, + { + "epoch": 0.43015178801131976, + "grad_norm": 0.1681668758392334, + "learning_rate": 1.743413516609393e-05, + "loss": 0.2991, + "step": 209 + }, + { + "epoch": 0.43220993053768975, + "grad_norm": 0.16957956552505493, + "learning_rate": 1.741122565864834e-05, + "loss": 0.2868, + "step": 210 + }, + { + "epoch": 0.4342680730640597, + "grad_norm": 0.15875883400440216, + "learning_rate": 1.738831615120275e-05, + "loss": 0.2946, + "step": 211 + }, + { + "epoch": 0.4363262155904296, + "grad_norm": 0.18127889931201935, + "learning_rate": 1.736540664375716e-05, + "loss": 0.2835, + "step": 212 + }, + { + "epoch": 0.4383843581167996, + "grad_norm": 0.17822811007499695, + "learning_rate": 1.7342497136311573e-05, + "loss": 0.2944, + "step": 213 + }, + { + "epoch": 0.44044250064316953, + "grad_norm": 0.17555806040763855, + "learning_rate": 1.731958762886598e-05, + "loss": 0.3001, + "step": 214 + }, + { + "epoch": 0.44250064316953946, + "grad_norm": 0.18709121644496918, + "learning_rate": 1.729667812142039e-05, + "loss": 0.282, + "step": 215 + }, + { + "epoch": 0.44455878569590945, + "grad_norm": 0.16322475671768188, + "learning_rate": 1.7273768613974803e-05, + "loss": 0.2883, + "step": 216 + }, + { + "epoch": 0.4466169282222794, + "grad_norm": 0.1677054911851883, + "learning_rate": 1.7250859106529212e-05, + "loss": 0.28, + "step": 217 + }, + { + "epoch": 0.4486750707486493, + "grad_norm": 0.15764063596725464, + "learning_rate": 1.722794959908362e-05, + "loss": 0.2768, + "step": 218 + }, + { + "epoch": 0.4507332132750193, + "grad_norm": 0.16166841983795166, + "learning_rate": 1.7205040091638033e-05, + "loss": 0.2868, + "step": 219 + }, + { + "epoch": 0.45279135580138924, + "grad_norm": 0.1799350380897522, + "learning_rate": 1.7182130584192442e-05, + "loss": 0.2891, + "step": 220 + }, + { + "epoch": 0.45484949832775917, + "grad_norm": 0.18119174242019653, + "learning_rate": 1.715922107674685e-05, + "loss": 0.2841, + "step": 221 + }, + { + "epoch": 0.45690764085412916, + "grad_norm": 0.17725548148155212, + "learning_rate": 1.713631156930126e-05, + "loss": 0.3038, + "step": 222 + }, + { + "epoch": 0.4589657833804991, + "grad_norm": 0.1628233790397644, + "learning_rate": 1.7113402061855672e-05, + "loss": 0.2868, + "step": 223 + }, + { + "epoch": 0.461023925906869, + "grad_norm": 0.1745166927576065, + "learning_rate": 1.709049255441008e-05, + "loss": 0.3033, + "step": 224 + }, + { + "epoch": 0.463082068433239, + "grad_norm": 0.17708267271518707, + "learning_rate": 1.706758304696449e-05, + "loss": 0.2842, + "step": 225 + }, + { + "epoch": 0.46514021095960895, + "grad_norm": 0.1738453358411789, + "learning_rate": 1.7044673539518902e-05, + "loss": 0.3005, + "step": 226 + }, + { + "epoch": 0.4671983534859789, + "grad_norm": 0.1706874966621399, + "learning_rate": 1.702176403207331e-05, + "loss": 0.2924, + "step": 227 + }, + { + "epoch": 0.46925649601234887, + "grad_norm": 0.1697423756122589, + "learning_rate": 1.699885452462772e-05, + "loss": 0.2783, + "step": 228 + }, + { + "epoch": 0.4713146385387188, + "grad_norm": 0.1783403754234314, + "learning_rate": 1.6975945017182132e-05, + "loss": 0.2924, + "step": 229 + }, + { + "epoch": 0.47337278106508873, + "grad_norm": 0.17431536316871643, + "learning_rate": 1.695303550973654e-05, + "loss": 0.2792, + "step": 230 + }, + { + "epoch": 0.4754309235914587, + "grad_norm": 0.164026141166687, + "learning_rate": 1.6930126002290953e-05, + "loss": 0.2825, + "step": 231 + }, + { + "epoch": 0.47748906611782865, + "grad_norm": 0.16449657082557678, + "learning_rate": 1.6907216494845362e-05, + "loss": 0.2831, + "step": 232 + }, + { + "epoch": 0.4795472086441986, + "grad_norm": 0.1812741607427597, + "learning_rate": 1.688430698739977e-05, + "loss": 0.2849, + "step": 233 + }, + { + "epoch": 0.4816053511705686, + "grad_norm": 0.18431834876537323, + "learning_rate": 1.6861397479954183e-05, + "loss": 0.2802, + "step": 234 + }, + { + "epoch": 0.4836634936969385, + "grad_norm": 0.18349015712738037, + "learning_rate": 1.6838487972508592e-05, + "loss": 0.2804, + "step": 235 + }, + { + "epoch": 0.48572163622330844, + "grad_norm": 0.1769968420267105, + "learning_rate": 1.6815578465063e-05, + "loss": 0.2777, + "step": 236 + }, + { + "epoch": 0.4877797787496784, + "grad_norm": 0.17207500338554382, + "learning_rate": 1.6792668957617413e-05, + "loss": 0.2883, + "step": 237 + }, + { + "epoch": 0.48983792127604836, + "grad_norm": 0.1729692667722702, + "learning_rate": 1.6769759450171822e-05, + "loss": 0.2784, + "step": 238 + }, + { + "epoch": 0.4918960638024183, + "grad_norm": 0.17234881222248077, + "learning_rate": 1.6746849942726235e-05, + "loss": 0.2816, + "step": 239 + }, + { + "epoch": 0.4939542063287883, + "grad_norm": 0.17132551968097687, + "learning_rate": 1.6723940435280644e-05, + "loss": 0.2812, + "step": 240 + }, + { + "epoch": 0.4960123488551582, + "grad_norm": 0.1752254068851471, + "learning_rate": 1.6701030927835052e-05, + "loss": 0.2799, + "step": 241 + }, + { + "epoch": 0.49807049138152815, + "grad_norm": 0.1768665313720703, + "learning_rate": 1.6678121420389465e-05, + "loss": 0.2966, + "step": 242 + }, + { + "epoch": 0.5001286339078981, + "grad_norm": 0.18139514327049255, + "learning_rate": 1.6655211912943874e-05, + "loss": 0.2816, + "step": 243 + }, + { + "epoch": 0.5021867764342681, + "grad_norm": 0.17312943935394287, + "learning_rate": 1.6632302405498283e-05, + "loss": 0.2845, + "step": 244 + }, + { + "epoch": 0.5042449189606381, + "grad_norm": 0.17966389656066895, + "learning_rate": 1.6609392898052695e-05, + "loss": 0.2864, + "step": 245 + }, + { + "epoch": 0.506303061487008, + "grad_norm": 0.16653811931610107, + "learning_rate": 1.6586483390607104e-05, + "loss": 0.2759, + "step": 246 + }, + { + "epoch": 0.5083612040133779, + "grad_norm": 0.1634613424539566, + "learning_rate": 1.6563573883161516e-05, + "loss": 0.2728, + "step": 247 + }, + { + "epoch": 0.5104193465397479, + "grad_norm": 0.17358507215976715, + "learning_rate": 1.654066437571592e-05, + "loss": 0.2706, + "step": 248 + }, + { + "epoch": 0.5124774890661178, + "grad_norm": 0.17524316906929016, + "learning_rate": 1.6517754868270334e-05, + "loss": 0.2805, + "step": 249 + }, + { + "epoch": 0.5145356315924878, + "grad_norm": 0.18134094774723053, + "learning_rate": 1.6494845360824743e-05, + "loss": 0.2909, + "step": 250 + }, + { + "epoch": 0.5165937741188578, + "grad_norm": 0.17795510590076447, + "learning_rate": 1.647193585337915e-05, + "loss": 0.2889, + "step": 251 + }, + { + "epoch": 0.5186519166452277, + "grad_norm": 0.16782547533512115, + "learning_rate": 1.6449026345933564e-05, + "loss": 0.2842, + "step": 252 + }, + { + "epoch": 0.5207100591715976, + "grad_norm": 0.17360062897205353, + "learning_rate": 1.6426116838487973e-05, + "loss": 0.2763, + "step": 253 + }, + { + "epoch": 0.5227682016979676, + "grad_norm": 0.17241406440734863, + "learning_rate": 1.6403207331042385e-05, + "loss": 0.2753, + "step": 254 + }, + { + "epoch": 0.5248263442243375, + "grad_norm": 0.1709229201078415, + "learning_rate": 1.6380297823596794e-05, + "loss": 0.2732, + "step": 255 + }, + { + "epoch": 0.5268844867507075, + "grad_norm": 0.1807374209165573, + "learning_rate": 1.6357388316151203e-05, + "loss": 0.2856, + "step": 256 + }, + { + "epoch": 0.5289426292770775, + "grad_norm": 0.1749904304742813, + "learning_rate": 1.6334478808705615e-05, + "loss": 0.285, + "step": 257 + }, + { + "epoch": 0.5310007718034474, + "grad_norm": 0.16673170030117035, + "learning_rate": 1.6311569301260024e-05, + "loss": 0.2825, + "step": 258 + }, + { + "epoch": 0.5330589143298173, + "grad_norm": 0.17239685356616974, + "learning_rate": 1.6288659793814433e-05, + "loss": 0.2845, + "step": 259 + }, + { + "epoch": 0.5351170568561873, + "grad_norm": 0.1831504851579666, + "learning_rate": 1.6265750286368845e-05, + "loss": 0.2859, + "step": 260 + }, + { + "epoch": 0.5371751993825572, + "grad_norm": 0.18507827818393707, + "learning_rate": 1.6242840778923254e-05, + "loss": 0.293, + "step": 261 + }, + { + "epoch": 0.5392333419089272, + "grad_norm": 0.16738134622573853, + "learning_rate": 1.6219931271477663e-05, + "loss": 0.2853, + "step": 262 + }, + { + "epoch": 0.5412914844352972, + "grad_norm": 0.1701226830482483, + "learning_rate": 1.6197021764032075e-05, + "loss": 0.2763, + "step": 263 + }, + { + "epoch": 0.5433496269616671, + "grad_norm": 0.18195705115795135, + "learning_rate": 1.6174112256586484e-05, + "loss": 0.2797, + "step": 264 + }, + { + "epoch": 0.545407769488037, + "grad_norm": 0.1832309514284134, + "learning_rate": 1.6151202749140896e-05, + "loss": 0.2885, + "step": 265 + }, + { + "epoch": 0.547465912014407, + "grad_norm": 0.1773810088634491, + "learning_rate": 1.6128293241695305e-05, + "loss": 0.2682, + "step": 266 + }, + { + "epoch": 0.5495240545407769, + "grad_norm": 0.16989603638648987, + "learning_rate": 1.6105383734249714e-05, + "loss": 0.2821, + "step": 267 + }, + { + "epoch": 0.551582197067147, + "grad_norm": 0.17835170030593872, + "learning_rate": 1.6082474226804127e-05, + "loss": 0.2774, + "step": 268 + }, + { + "epoch": 0.5536403395935169, + "grad_norm": 0.1777082234621048, + "learning_rate": 1.6059564719358535e-05, + "loss": 0.2726, + "step": 269 + }, + { + "epoch": 0.5556984821198868, + "grad_norm": 0.18766450881958008, + "learning_rate": 1.6036655211912944e-05, + "loss": 0.2879, + "step": 270 + }, + { + "epoch": 0.5577566246462567, + "grad_norm": 0.1868186593055725, + "learning_rate": 1.6013745704467357e-05, + "loss": 0.2808, + "step": 271 + }, + { + "epoch": 0.5598147671726267, + "grad_norm": 0.16695882380008698, + "learning_rate": 1.5990836197021766e-05, + "loss": 0.2668, + "step": 272 + }, + { + "epoch": 0.5618729096989966, + "grad_norm": 0.17224495112895966, + "learning_rate": 1.5967926689576178e-05, + "loss": 0.2682, + "step": 273 + }, + { + "epoch": 0.5639310522253667, + "grad_norm": 0.20116423070430756, + "learning_rate": 1.5945017182130587e-05, + "loss": 0.276, + "step": 274 + }, + { + "epoch": 0.5659891947517366, + "grad_norm": 0.19478343427181244, + "learning_rate": 1.5922107674684996e-05, + "loss": 0.2854, + "step": 275 + }, + { + "epoch": 0.5680473372781065, + "grad_norm": 0.20242950320243835, + "learning_rate": 1.5899198167239405e-05, + "loss": 0.2854, + "step": 276 + }, + { + "epoch": 0.5701054798044765, + "grad_norm": 0.19146093726158142, + "learning_rate": 1.5876288659793813e-05, + "loss": 0.2817, + "step": 277 + }, + { + "epoch": 0.5721636223308464, + "grad_norm": 0.1804896742105484, + "learning_rate": 1.5853379152348226e-05, + "loss": 0.2714, + "step": 278 + }, + { + "epoch": 0.5742217648572163, + "grad_norm": 0.19315646588802338, + "learning_rate": 1.5830469644902635e-05, + "loss": 0.2703, + "step": 279 + }, + { + "epoch": 0.5762799073835864, + "grad_norm": 0.1910266876220703, + "learning_rate": 1.5807560137457047e-05, + "loss": 0.2728, + "step": 280 + }, + { + "epoch": 0.5783380499099563, + "grad_norm": 0.20330773293972015, + "learning_rate": 1.5784650630011456e-05, + "loss": 0.2717, + "step": 281 + }, + { + "epoch": 0.5803961924363262, + "grad_norm": 0.19080683588981628, + "learning_rate": 1.5761741122565865e-05, + "loss": 0.2679, + "step": 282 + }, + { + "epoch": 0.5824543349626962, + "grad_norm": 0.18052135407924652, + "learning_rate": 1.5738831615120277e-05, + "loss": 0.2815, + "step": 283 + }, + { + "epoch": 0.5845124774890661, + "grad_norm": 0.1998361051082611, + "learning_rate": 1.5715922107674686e-05, + "loss": 0.2888, + "step": 284 + }, + { + "epoch": 0.586570620015436, + "grad_norm": 0.1978764683008194, + "learning_rate": 1.5693012600229095e-05, + "loss": 0.2926, + "step": 285 + }, + { + "epoch": 0.5886287625418061, + "grad_norm": 0.17189203202724457, + "learning_rate": 1.5670103092783507e-05, + "loss": 0.2674, + "step": 286 + }, + { + "epoch": 0.590686905068176, + "grad_norm": 0.1937166303396225, + "learning_rate": 1.5647193585337916e-05, + "loss": 0.2838, + "step": 287 + }, + { + "epoch": 0.5927450475945459, + "grad_norm": 0.18978627026081085, + "learning_rate": 1.5624284077892328e-05, + "loss": 0.273, + "step": 288 + }, + { + "epoch": 0.5948031901209159, + "grad_norm": 0.17718705534934998, + "learning_rate": 1.5601374570446737e-05, + "loss": 0.2842, + "step": 289 + }, + { + "epoch": 0.5968613326472858, + "grad_norm": 0.1912536770105362, + "learning_rate": 1.5578465063001146e-05, + "loss": 0.2736, + "step": 290 + }, + { + "epoch": 0.5989194751736557, + "grad_norm": 0.18104907870292664, + "learning_rate": 1.555555555555556e-05, + "loss": 0.274, + "step": 291 + }, + { + "epoch": 0.6009776177000258, + "grad_norm": 0.1620381772518158, + "learning_rate": 1.5532646048109967e-05, + "loss": 0.2663, + "step": 292 + }, + { + "epoch": 0.6030357602263957, + "grad_norm": 0.17973916232585907, + "learning_rate": 1.5509736540664376e-05, + "loss": 0.2791, + "step": 293 + }, + { + "epoch": 0.6050939027527656, + "grad_norm": 0.16821186244487762, + "learning_rate": 1.548682703321879e-05, + "loss": 0.2787, + "step": 294 + }, + { + "epoch": 0.6071520452791356, + "grad_norm": 0.18426693975925446, + "learning_rate": 1.5463917525773197e-05, + "loss": 0.2886, + "step": 295 + }, + { + "epoch": 0.6092101878055055, + "grad_norm": 0.19796033203601837, + "learning_rate": 1.5441008018327606e-05, + "loss": 0.268, + "step": 296 + }, + { + "epoch": 0.6112683303318754, + "grad_norm": 0.1971343755722046, + "learning_rate": 1.541809851088202e-05, + "loss": 0.2761, + "step": 297 + }, + { + "epoch": 0.6133264728582455, + "grad_norm": 0.17458567023277283, + "learning_rate": 1.5395189003436427e-05, + "loss": 0.2831, + "step": 298 + }, + { + "epoch": 0.6153846153846154, + "grad_norm": 0.17610400915145874, + "learning_rate": 1.537227949599084e-05, + "loss": 0.2691, + "step": 299 + }, + { + "epoch": 0.6174427579109854, + "grad_norm": 0.1929042488336563, + "learning_rate": 1.534936998854525e-05, + "loss": 0.2847, + "step": 300 + }, + { + "epoch": 0.6174427579109854, + "eval_loss": 0.2959522604942322, + "eval_runtime": 2428.6339, + "eval_samples_per_second": 3.201, + "eval_steps_per_second": 0.8, + "step": 300 + } + ], + "logging_steps": 1, + "max_steps": 970, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 9.916186406277612e+17, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-300/training_args.bin b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-300/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..5999c7ee9dd10ee9076d748e4757533e635fa832 --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-300/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ee55a11f5a306eb7c39b536fdfe2459bc279e468da50f6adda478c4deffcb812 +size 5688 diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-400/README.md b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-400/README.md new file mode 100644 index 0000000000000000000000000000000000000000..fb0d77d70fdc5c829c8889cb85828736b7eb9714 --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-400/README.md @@ -0,0 +1,202 @@ +--- +base_model: unsloth/codegemma-7b-it +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.15.0 \ No newline at end of file diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-400/adapter_config.json b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-400/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e841602c6a59fc7b085ac647af4d4c312445d261 --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-400/adapter_config.json @@ -0,0 +1,39 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "unsloth/codegemma-7b-it", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "o_proj", + "up_proj", + "q_proj", + "down_proj", + "gate_proj", + "k_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-400/adapter_model.safetensors b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-400/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..0d273efd694b7f8f12a34ff4fc7fb45e93ac9b93 --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-400/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c28d1150755988379f75a873814aff4bb698c6bfe61017321740de287540011d +size 800116456 diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-400/optimizer.pt b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-400/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..ea19d1783b76dfb7fa784ec6f62d0a97b1af1a45 --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-400/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8eaaabccd9eef6be0dbad53b8764bdb248322edd314f59dc0c08eeaa8b6aaea0 +size 406743860 diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-400/rng_state.pth b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-400/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..59b00158305a3b31d900d09514c24abd04f0d915 --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-400/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a6d4fbb4eb2a12b17b73babfa38b90bdafc73a682a0af60498a1e002b1fd5b3a +size 14244 diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-400/scheduler.pt b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-400/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..3fe79d79a6e7a7ff5bf12aff6b7d708d2051326b --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-400/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4268342c3fe77dd38c7543ef5b2cdacbd82e8a3bb5c5cae6fb040e599526f0e +size 1064 diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-400/special_tokens_map.json b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-400/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..f6119589e367b2de0fc8cbd2f1217667532e3174 --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-400/special_tokens_map.json @@ -0,0 +1,30 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-400/tokenizer.json b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-400/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..45a5e23f54141c5f4f97a8d58f3ffadc28e287ba --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-400/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d964a2c8346d40f95791533eae48730d5f163c2e65fd16333560fd3e661df318 +size 34362915 diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-400/tokenizer.model b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-400/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..71a98ce40269d847e58957e1e070d9ae8eb184af --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-400/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:583f2ebd2a1936009b7da991ea255504db68c7a9713a78673d1335a87098966c +size 4241023 diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-400/tokenizer_config.json b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-400/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9b9b1b4acdd4afcedae39d1cf6f0bc7ef7d9910f --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-400/tokenizer_config.json @@ -0,0 +1,2011 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "<|fim_prefix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "<|fim_middle|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "<|fim_suffix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "<|file_separator|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "139": { + "content": "▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "140": { + "content": "▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "141": { + "content": "▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "142": { + "content": "▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "143": { + "content": "▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "144": { + "content": "▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "145": { + "content": "▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "146": { + "content": "▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "147": { + "content": "▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "148": { + "content": "▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "149": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "150": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "152": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "153": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "154": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "155": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "156": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "157": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "158": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "159": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "160": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "161": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "162": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "163": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "164": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "165": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "166": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "167": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "168": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255968": { + "content": "[toxicity=0]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255969": { + "content": "\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255970": { + "content": "\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255971": { + "content": "\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255972": { + "content": "\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255973": { + "content": "\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255974": { + "content": "\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255975": { + "content": "\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255976": { + "content": "\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255977": { + "content": "\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255978": { + "content": "\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255979": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255980": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255981": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255982": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255983": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255984": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255985": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255986": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255987": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255988": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255989": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255990": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255991": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255992": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255993": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255994": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255995": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255996": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255997": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255998": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255999": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "bos_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "extra_special_tokens": {}, + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "padding_side": "left", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-400/trainer_state.json b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-400/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..e98c4ef0e30723ff68afe00644989df242ec7dcd --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-400/trainer_state.json @@ -0,0 +1,2866 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.8232570105479804, + "eval_steps": 100, + "global_step": 400, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0020581425263699513, + "grad_norm": 11.994463920593262, + "learning_rate": 2.061855670103093e-07, + "loss": 2.91, + "step": 1 + }, + { + "epoch": 0.004116285052739903, + "grad_norm": 11.769092559814453, + "learning_rate": 4.123711340206186e-07, + "loss": 2.8686, + "step": 2 + }, + { + "epoch": 0.0061744275791098535, + "grad_norm": 13.05551815032959, + "learning_rate": 6.185567010309279e-07, + "loss": 3.0286, + "step": 3 + }, + { + "epoch": 0.008232570105479805, + "grad_norm": 12.334521293640137, + "learning_rate": 8.247422680412372e-07, + "loss": 2.904, + "step": 4 + }, + { + "epoch": 0.010290712631849755, + "grad_norm": 12.075353622436523, + "learning_rate": 1.0309278350515464e-06, + "loss": 2.8991, + "step": 5 + }, + { + "epoch": 0.012348855158219707, + "grad_norm": 11.86032485961914, + "learning_rate": 1.2371134020618557e-06, + "loss": 3.0007, + "step": 6 + }, + { + "epoch": 0.014406997684589657, + "grad_norm": 10.10457992553711, + "learning_rate": 1.4432989690721649e-06, + "loss": 2.8493, + "step": 7 + }, + { + "epoch": 0.01646514021095961, + "grad_norm": 8.56408405303955, + "learning_rate": 1.6494845360824744e-06, + "loss": 2.9573, + "step": 8 + }, + { + "epoch": 0.01852328273732956, + "grad_norm": 6.307392120361328, + "learning_rate": 1.8556701030927837e-06, + "loss": 2.9507, + "step": 9 + }, + { + "epoch": 0.02058142526369951, + "grad_norm": 4.276430130004883, + "learning_rate": 2.061855670103093e-06, + "loss": 2.8988, + "step": 10 + }, + { + "epoch": 0.022639567790069464, + "grad_norm": 2.5912015438079834, + "learning_rate": 2.268041237113402e-06, + "loss": 2.9926, + "step": 11 + }, + { + "epoch": 0.024697710316439414, + "grad_norm": 2.018446207046509, + "learning_rate": 2.4742268041237115e-06, + "loss": 2.9874, + "step": 12 + }, + { + "epoch": 0.026755852842809364, + "grad_norm": 1.8558588027954102, + "learning_rate": 2.680412371134021e-06, + "loss": 2.8608, + "step": 13 + }, + { + "epoch": 0.028813995369179314, + "grad_norm": 1.9658265113830566, + "learning_rate": 2.8865979381443297e-06, + "loss": 2.8596, + "step": 14 + }, + { + "epoch": 0.030872137895549268, + "grad_norm": 1.872044563293457, + "learning_rate": 3.0927835051546395e-06, + "loss": 2.8836, + "step": 15 + }, + { + "epoch": 0.03293028042191922, + "grad_norm": 1.8884096145629883, + "learning_rate": 3.298969072164949e-06, + "loss": 2.9383, + "step": 16 + }, + { + "epoch": 0.03498842294828917, + "grad_norm": 1.8795744180679321, + "learning_rate": 3.5051546391752577e-06, + "loss": 2.883, + "step": 17 + }, + { + "epoch": 0.03704656547465912, + "grad_norm": 1.783678412437439, + "learning_rate": 3.7113402061855674e-06, + "loss": 2.8019, + "step": 18 + }, + { + "epoch": 0.039104708001029075, + "grad_norm": 1.820617914199829, + "learning_rate": 3.917525773195877e-06, + "loss": 2.8813, + "step": 19 + }, + { + "epoch": 0.04116285052739902, + "grad_norm": 1.8188731670379639, + "learning_rate": 4.123711340206186e-06, + "loss": 2.8401, + "step": 20 + }, + { + "epoch": 0.043220993053768975, + "grad_norm": 1.7305251359939575, + "learning_rate": 4.329896907216495e-06, + "loss": 2.7478, + "step": 21 + }, + { + "epoch": 0.04527913558013893, + "grad_norm": 1.7014551162719727, + "learning_rate": 4.536082474226804e-06, + "loss": 2.7356, + "step": 22 + }, + { + "epoch": 0.047337278106508875, + "grad_norm": 1.677381157875061, + "learning_rate": 4.742268041237113e-06, + "loss": 2.7593, + "step": 23 + }, + { + "epoch": 0.04939542063287883, + "grad_norm": 1.628554344177246, + "learning_rate": 4.948453608247423e-06, + "loss": 2.7689, + "step": 24 + }, + { + "epoch": 0.051453563159248775, + "grad_norm": 1.4968128204345703, + "learning_rate": 5.154639175257732e-06, + "loss": 2.6613, + "step": 25 + }, + { + "epoch": 0.05351170568561873, + "grad_norm": 1.4734832048416138, + "learning_rate": 5.360824742268042e-06, + "loss": 2.7095, + "step": 26 + }, + { + "epoch": 0.05556984821198868, + "grad_norm": 1.3745571374893188, + "learning_rate": 5.567010309278351e-06, + "loss": 2.655, + "step": 27 + }, + { + "epoch": 0.05762799073835863, + "grad_norm": 1.3381729125976562, + "learning_rate": 5.7731958762886594e-06, + "loss": 2.55, + "step": 28 + }, + { + "epoch": 0.05968613326472858, + "grad_norm": 1.3388073444366455, + "learning_rate": 5.979381443298969e-06, + "loss": 2.5219, + "step": 29 + }, + { + "epoch": 0.061744275791098535, + "grad_norm": 1.317008376121521, + "learning_rate": 6.185567010309279e-06, + "loss": 2.4491, + "step": 30 + }, + { + "epoch": 0.06380241831746848, + "grad_norm": 1.3210794925689697, + "learning_rate": 6.391752577319588e-06, + "loss": 2.4358, + "step": 31 + }, + { + "epoch": 0.06586056084383844, + "grad_norm": 1.182519555091858, + "learning_rate": 6.597938144329898e-06, + "loss": 2.4514, + "step": 32 + }, + { + "epoch": 0.06791870337020839, + "grad_norm": 1.2238099575042725, + "learning_rate": 6.804123711340207e-06, + "loss": 2.442, + "step": 33 + }, + { + "epoch": 0.06997684589657834, + "grad_norm": 1.1793314218521118, + "learning_rate": 7.010309278350515e-06, + "loss": 2.3864, + "step": 34 + }, + { + "epoch": 0.0720349884229483, + "grad_norm": 1.1983020305633545, + "learning_rate": 7.216494845360825e-06, + "loss": 2.3796, + "step": 35 + }, + { + "epoch": 0.07409313094931824, + "grad_norm": 1.2189652919769287, + "learning_rate": 7.422680412371135e-06, + "loss": 2.4152, + "step": 36 + }, + { + "epoch": 0.07615127347568819, + "grad_norm": 1.14923095703125, + "learning_rate": 7.628865979381444e-06, + "loss": 2.3298, + "step": 37 + }, + { + "epoch": 0.07820941600205815, + "grad_norm": 1.147013545036316, + "learning_rate": 7.835051546391754e-06, + "loss": 2.2488, + "step": 38 + }, + { + "epoch": 0.0802675585284281, + "grad_norm": 1.133981466293335, + "learning_rate": 8.041237113402063e-06, + "loss": 2.1825, + "step": 39 + }, + { + "epoch": 0.08232570105479804, + "grad_norm": 1.1686867475509644, + "learning_rate": 8.247422680412371e-06, + "loss": 2.2282, + "step": 40 + }, + { + "epoch": 0.084383843581168, + "grad_norm": 1.131690502166748, + "learning_rate": 8.453608247422681e-06, + "loss": 2.0962, + "step": 41 + }, + { + "epoch": 0.08644198610753795, + "grad_norm": 1.1626195907592773, + "learning_rate": 8.65979381443299e-06, + "loss": 2.1161, + "step": 42 + }, + { + "epoch": 0.0885001286339079, + "grad_norm": 1.1508581638336182, + "learning_rate": 8.865979381443299e-06, + "loss": 1.9856, + "step": 43 + }, + { + "epoch": 0.09055827116027786, + "grad_norm": 1.2286733388900757, + "learning_rate": 9.072164948453609e-06, + "loss": 2.076, + "step": 44 + }, + { + "epoch": 0.0926164136866478, + "grad_norm": 1.82068932056427, + "learning_rate": 9.278350515463918e-06, + "loss": 1.9995, + "step": 45 + }, + { + "epoch": 0.09467455621301775, + "grad_norm": 2.079101324081421, + "learning_rate": 9.484536082474226e-06, + "loss": 1.9601, + "step": 46 + }, + { + "epoch": 0.0967326987393877, + "grad_norm": 1.1209226846694946, + "learning_rate": 9.690721649484536e-06, + "loss": 1.9346, + "step": 47 + }, + { + "epoch": 0.09879084126575766, + "grad_norm": 1.0579711198806763, + "learning_rate": 9.896907216494846e-06, + "loss": 1.8764, + "step": 48 + }, + { + "epoch": 0.1008489837921276, + "grad_norm": 1.0434011220932007, + "learning_rate": 1.0103092783505156e-05, + "loss": 1.8483, + "step": 49 + }, + { + "epoch": 0.10290712631849755, + "grad_norm": 1.0089991092681885, + "learning_rate": 1.0309278350515464e-05, + "loss": 1.8018, + "step": 50 + }, + { + "epoch": 0.10496526884486751, + "grad_norm": 1.0117324590682983, + "learning_rate": 1.0515463917525775e-05, + "loss": 1.8003, + "step": 51 + }, + { + "epoch": 0.10702341137123746, + "grad_norm": 1.0006697177886963, + "learning_rate": 1.0721649484536083e-05, + "loss": 1.7482, + "step": 52 + }, + { + "epoch": 0.1090815538976074, + "grad_norm": 2.1164329051971436, + "learning_rate": 1.0927835051546391e-05, + "loss": 1.7363, + "step": 53 + }, + { + "epoch": 0.11113969642397736, + "grad_norm": 0.9573502540588379, + "learning_rate": 1.1134020618556703e-05, + "loss": 1.661, + "step": 54 + }, + { + "epoch": 0.11319783895034731, + "grad_norm": 1.0059764385223389, + "learning_rate": 1.134020618556701e-05, + "loss": 1.6979, + "step": 55 + }, + { + "epoch": 0.11525598147671726, + "grad_norm": 0.9719656109809875, + "learning_rate": 1.1546391752577319e-05, + "loss": 1.6318, + "step": 56 + }, + { + "epoch": 0.11731412400308722, + "grad_norm": 1.0024539232254028, + "learning_rate": 1.175257731958763e-05, + "loss": 1.6283, + "step": 57 + }, + { + "epoch": 0.11937226652945716, + "grad_norm": 0.9772456288337708, + "learning_rate": 1.1958762886597938e-05, + "loss": 1.5611, + "step": 58 + }, + { + "epoch": 0.12143040905582711, + "grad_norm": 0.9947625994682312, + "learning_rate": 1.2164948453608248e-05, + "loss": 1.6073, + "step": 59 + }, + { + "epoch": 0.12348855158219707, + "grad_norm": 2.112889051437378, + "learning_rate": 1.2371134020618558e-05, + "loss": 1.6208, + "step": 60 + }, + { + "epoch": 0.12554669410856703, + "grad_norm": 1.0515345335006714, + "learning_rate": 1.2577319587628866e-05, + "loss": 1.569, + "step": 61 + }, + { + "epoch": 0.12760483663493696, + "grad_norm": 1.0782145261764526, + "learning_rate": 1.2783505154639176e-05, + "loss": 1.5097, + "step": 62 + }, + { + "epoch": 0.12966297916130692, + "grad_norm": 1.154104232788086, + "learning_rate": 1.2989690721649485e-05, + "loss": 1.5472, + "step": 63 + }, + { + "epoch": 0.13172112168767688, + "grad_norm": 1.1614656448364258, + "learning_rate": 1.3195876288659795e-05, + "loss": 1.4833, + "step": 64 + }, + { + "epoch": 0.13377926421404682, + "grad_norm": 1.1720911264419556, + "learning_rate": 1.3402061855670103e-05, + "loss": 1.4644, + "step": 65 + }, + { + "epoch": 0.13583740674041678, + "grad_norm": 1.8903896808624268, + "learning_rate": 1.3608247422680415e-05, + "loss": 1.4286, + "step": 66 + }, + { + "epoch": 0.13789554926678674, + "grad_norm": 1.2675013542175293, + "learning_rate": 1.3814432989690723e-05, + "loss": 1.416, + "step": 67 + }, + { + "epoch": 0.13995369179315667, + "grad_norm": 1.266434907913208, + "learning_rate": 1.402061855670103e-05, + "loss": 1.3171, + "step": 68 + }, + { + "epoch": 0.14201183431952663, + "grad_norm": 1.3408889770507812, + "learning_rate": 1.4226804123711342e-05, + "loss": 1.3396, + "step": 69 + }, + { + "epoch": 0.1440699768458966, + "grad_norm": 1.3862446546554565, + "learning_rate": 1.443298969072165e-05, + "loss": 1.2642, + "step": 70 + }, + { + "epoch": 0.14612811937226652, + "grad_norm": 2.110553026199341, + "learning_rate": 1.4639175257731958e-05, + "loss": 1.2593, + "step": 71 + }, + { + "epoch": 0.14818626189863648, + "grad_norm": 1.7017499208450317, + "learning_rate": 1.484536082474227e-05, + "loss": 1.24, + "step": 72 + }, + { + "epoch": 0.15024440442500644, + "grad_norm": 1.9851700067520142, + "learning_rate": 1.5051546391752578e-05, + "loss": 1.2313, + "step": 73 + }, + { + "epoch": 0.15230254695137638, + "grad_norm": 2.009608030319214, + "learning_rate": 1.5257731958762888e-05, + "loss": 1.1281, + "step": 74 + }, + { + "epoch": 0.15436068947774634, + "grad_norm": 2.7587485313415527, + "learning_rate": 1.5463917525773197e-05, + "loss": 1.1248, + "step": 75 + }, + { + "epoch": 0.1564188320041163, + "grad_norm": 2.780954599380493, + "learning_rate": 1.5670103092783507e-05, + "loss": 1.0797, + "step": 76 + }, + { + "epoch": 0.15847697453048623, + "grad_norm": 3.1470866203308105, + "learning_rate": 1.5876288659793813e-05, + "loss": 1.0064, + "step": 77 + }, + { + "epoch": 0.1605351170568562, + "grad_norm": 4.653595447540283, + "learning_rate": 1.6082474226804127e-05, + "loss": 0.9219, + "step": 78 + }, + { + "epoch": 0.16259325958322615, + "grad_norm": 4.157363414764404, + "learning_rate": 1.6288659793814433e-05, + "loss": 0.8709, + "step": 79 + }, + { + "epoch": 0.16465140210959608, + "grad_norm": 4.5814924240112305, + "learning_rate": 1.6494845360824743e-05, + "loss": 0.7693, + "step": 80 + }, + { + "epoch": 0.16670954463596604, + "grad_norm": 5.096139907836914, + "learning_rate": 1.6701030927835052e-05, + "loss": 0.6868, + "step": 81 + }, + { + "epoch": 0.168767687162336, + "grad_norm": 4.858880519866943, + "learning_rate": 1.6907216494845362e-05, + "loss": 0.5971, + "step": 82 + }, + { + "epoch": 0.17082582968870594, + "grad_norm": 4.42564582824707, + "learning_rate": 1.7113402061855672e-05, + "loss": 0.4719, + "step": 83 + }, + { + "epoch": 0.1728839722150759, + "grad_norm": 7.720851421356201, + "learning_rate": 1.731958762886598e-05, + "loss": 0.3943, + "step": 84 + }, + { + "epoch": 0.17494211474144586, + "grad_norm": 0.41923192143440247, + "learning_rate": 1.752577319587629e-05, + "loss": 0.3635, + "step": 85 + }, + { + "epoch": 0.1770002572678158, + "grad_norm": 0.2771846354007721, + "learning_rate": 1.7731958762886598e-05, + "loss": 0.3597, + "step": 86 + }, + { + "epoch": 0.17905839979418575, + "grad_norm": 0.24761857092380524, + "learning_rate": 1.793814432989691e-05, + "loss": 0.3735, + "step": 87 + }, + { + "epoch": 0.1811165423205557, + "grad_norm": 0.23277048766613007, + "learning_rate": 1.8144329896907217e-05, + "loss": 0.3643, + "step": 88 + }, + { + "epoch": 0.18317468484692565, + "grad_norm": 0.22931228578090668, + "learning_rate": 1.8350515463917527e-05, + "loss": 0.3519, + "step": 89 + }, + { + "epoch": 0.1852328273732956, + "grad_norm": 0.20750615000724792, + "learning_rate": 1.8556701030927837e-05, + "loss": 0.3431, + "step": 90 + }, + { + "epoch": 0.18729096989966554, + "grad_norm": 0.2080322951078415, + "learning_rate": 1.8762886597938147e-05, + "loss": 0.3632, + "step": 91 + }, + { + "epoch": 0.1893491124260355, + "grad_norm": 0.20186181366443634, + "learning_rate": 1.8969072164948453e-05, + "loss": 0.3492, + "step": 92 + }, + { + "epoch": 0.19140725495240546, + "grad_norm": 0.19172786176204681, + "learning_rate": 1.9175257731958766e-05, + "loss": 0.3552, + "step": 93 + }, + { + "epoch": 0.1934653974787754, + "grad_norm": 0.1747850626707077, + "learning_rate": 1.9381443298969072e-05, + "loss": 0.3355, + "step": 94 + }, + { + "epoch": 0.19552354000514535, + "grad_norm": 0.196411594748497, + "learning_rate": 1.9587628865979382e-05, + "loss": 0.3271, + "step": 95 + }, + { + "epoch": 0.1975816825315153, + "grad_norm": 0.20063228905200958, + "learning_rate": 1.9793814432989692e-05, + "loss": 0.3351, + "step": 96 + }, + { + "epoch": 0.19963982505788525, + "grad_norm": 0.19240939617156982, + "learning_rate": 2e-05, + "loss": 0.3266, + "step": 97 + }, + { + "epoch": 0.2016979675842552, + "grad_norm": 0.18206572532653809, + "learning_rate": 1.997709049255441e-05, + "loss": 0.3393, + "step": 98 + }, + { + "epoch": 0.20375611011062517, + "grad_norm": 0.20384562015533447, + "learning_rate": 1.9954180985108823e-05, + "loss": 0.3395, + "step": 99 + }, + { + "epoch": 0.2058142526369951, + "grad_norm": 0.19944581389427185, + "learning_rate": 1.9931271477663232e-05, + "loss": 0.3268, + "step": 100 + }, + { + "epoch": 0.2058142526369951, + "eval_loss": 0.3456890285015106, + "eval_runtime": 2114.0178, + "eval_samples_per_second": 3.677, + "eval_steps_per_second": 0.92, + "step": 100 + }, + { + "epoch": 0.20787239516336506, + "grad_norm": 0.17743557691574097, + "learning_rate": 1.990836197021764e-05, + "loss": 0.3439, + "step": 101 + }, + { + "epoch": 0.20993053768973502, + "grad_norm": 0.18746449053287506, + "learning_rate": 1.9885452462772053e-05, + "loss": 0.326, + "step": 102 + }, + { + "epoch": 0.21198868021610495, + "grad_norm": 0.18555815517902374, + "learning_rate": 1.9862542955326462e-05, + "loss": 0.3337, + "step": 103 + }, + { + "epoch": 0.2140468227424749, + "grad_norm": 0.16591575741767883, + "learning_rate": 1.9839633447880874e-05, + "loss": 0.3121, + "step": 104 + }, + { + "epoch": 0.21610496526884487, + "grad_norm": 0.1621987372636795, + "learning_rate": 1.9816723940435283e-05, + "loss": 0.3287, + "step": 105 + }, + { + "epoch": 0.2181631077952148, + "grad_norm": 0.1614532470703125, + "learning_rate": 1.9793814432989692e-05, + "loss": 0.3306, + "step": 106 + }, + { + "epoch": 0.22022125032158477, + "grad_norm": 0.17993387579917908, + "learning_rate": 1.9770904925544104e-05, + "loss": 0.3341, + "step": 107 + }, + { + "epoch": 0.22227939284795473, + "grad_norm": 0.1550011783838272, + "learning_rate": 1.9747995418098513e-05, + "loss": 0.3197, + "step": 108 + }, + { + "epoch": 0.22433753537432466, + "grad_norm": 0.18471524119377136, + "learning_rate": 1.9725085910652922e-05, + "loss": 0.3285, + "step": 109 + }, + { + "epoch": 0.22639567790069462, + "grad_norm": 0.15604373812675476, + "learning_rate": 1.9702176403207334e-05, + "loss": 0.3298, + "step": 110 + }, + { + "epoch": 0.22845382042706458, + "grad_norm": 0.1682298630475998, + "learning_rate": 1.9679266895761743e-05, + "loss": 0.3343, + "step": 111 + }, + { + "epoch": 0.2305119629534345, + "grad_norm": 0.14933635294437408, + "learning_rate": 1.9656357388316152e-05, + "loss": 0.3134, + "step": 112 + }, + { + "epoch": 0.23257010547980447, + "grad_norm": 0.14892347157001495, + "learning_rate": 1.963344788087056e-05, + "loss": 0.3154, + "step": 113 + }, + { + "epoch": 0.23462824800617443, + "grad_norm": 0.1577889323234558, + "learning_rate": 1.9610538373424973e-05, + "loss": 0.3122, + "step": 114 + }, + { + "epoch": 0.23668639053254437, + "grad_norm": 0.16482344269752502, + "learning_rate": 1.9587628865979382e-05, + "loss": 0.3193, + "step": 115 + }, + { + "epoch": 0.23874453305891433, + "grad_norm": 0.15328913927078247, + "learning_rate": 1.956471935853379e-05, + "loss": 0.3217, + "step": 116 + }, + { + "epoch": 0.2408026755852843, + "grad_norm": 0.16140656173229218, + "learning_rate": 1.9541809851088203e-05, + "loss": 0.318, + "step": 117 + }, + { + "epoch": 0.24286081811165422, + "grad_norm": 0.15448373556137085, + "learning_rate": 1.9518900343642612e-05, + "loss": 0.3205, + "step": 118 + }, + { + "epoch": 0.24491896063802418, + "grad_norm": 0.14716887474060059, + "learning_rate": 1.9495990836197025e-05, + "loss": 0.3164, + "step": 119 + }, + { + "epoch": 0.24697710316439414, + "grad_norm": 0.16582027077674866, + "learning_rate": 1.9473081328751433e-05, + "loss": 0.3191, + "step": 120 + }, + { + "epoch": 0.24903524569076407, + "grad_norm": 0.15213699638843536, + "learning_rate": 1.9450171821305842e-05, + "loss": 0.304, + "step": 121 + }, + { + "epoch": 0.25109338821713406, + "grad_norm": 0.1659238487482071, + "learning_rate": 1.9427262313860255e-05, + "loss": 0.3184, + "step": 122 + }, + { + "epoch": 0.253151530743504, + "grad_norm": 0.15596656501293182, + "learning_rate": 1.9404352806414663e-05, + "loss": 0.3092, + "step": 123 + }, + { + "epoch": 0.2552096732698739, + "grad_norm": 0.15868476033210754, + "learning_rate": 1.9381443298969072e-05, + "loss": 0.3163, + "step": 124 + }, + { + "epoch": 0.2572678157962439, + "grad_norm": 0.15386095643043518, + "learning_rate": 1.9358533791523485e-05, + "loss": 0.3049, + "step": 125 + }, + { + "epoch": 0.25932595832261385, + "grad_norm": 0.15179213881492615, + "learning_rate": 1.9335624284077894e-05, + "loss": 0.3131, + "step": 126 + }, + { + "epoch": 0.2613841008489838, + "grad_norm": 0.1595134735107422, + "learning_rate": 1.9312714776632306e-05, + "loss": 0.3069, + "step": 127 + }, + { + "epoch": 0.26344224337535377, + "grad_norm": 0.16989803314208984, + "learning_rate": 1.9289805269186715e-05, + "loss": 0.3052, + "step": 128 + }, + { + "epoch": 0.2655003859017237, + "grad_norm": 0.14803892374038696, + "learning_rate": 1.9266895761741124e-05, + "loss": 0.3065, + "step": 129 + }, + { + "epoch": 0.26755852842809363, + "grad_norm": 0.16676583886146545, + "learning_rate": 1.9243986254295536e-05, + "loss": 0.2962, + "step": 130 + }, + { + "epoch": 0.2696166709544636, + "grad_norm": 0.15694552659988403, + "learning_rate": 1.9221076746849945e-05, + "loss": 0.3096, + "step": 131 + }, + { + "epoch": 0.27167481348083355, + "grad_norm": 0.17696696519851685, + "learning_rate": 1.9198167239404354e-05, + "loss": 0.3145, + "step": 132 + }, + { + "epoch": 0.2737329560072035, + "grad_norm": 0.17204038798809052, + "learning_rate": 1.9175257731958766e-05, + "loss": 0.3248, + "step": 133 + }, + { + "epoch": 0.2757910985335735, + "grad_norm": 0.15630359947681427, + "learning_rate": 1.9152348224513175e-05, + "loss": 0.3117, + "step": 134 + }, + { + "epoch": 0.2778492410599434, + "grad_norm": 0.15757997334003448, + "learning_rate": 1.9129438717067584e-05, + "loss": 0.3145, + "step": 135 + }, + { + "epoch": 0.27990738358631334, + "grad_norm": 0.16273653507232666, + "learning_rate": 1.9106529209621996e-05, + "loss": 0.3159, + "step": 136 + }, + { + "epoch": 0.28196552611268333, + "grad_norm": 0.16213104128837585, + "learning_rate": 1.9083619702176405e-05, + "loss": 0.2949, + "step": 137 + }, + { + "epoch": 0.28402366863905326, + "grad_norm": 0.15377865731716156, + "learning_rate": 1.9060710194730814e-05, + "loss": 0.306, + "step": 138 + }, + { + "epoch": 0.2860818111654232, + "grad_norm": 0.1545962244272232, + "learning_rate": 1.9037800687285223e-05, + "loss": 0.2966, + "step": 139 + }, + { + "epoch": 0.2881399536917932, + "grad_norm": 0.15516617894172668, + "learning_rate": 1.9014891179839635e-05, + "loss": 0.3122, + "step": 140 + }, + { + "epoch": 0.2901980962181631, + "grad_norm": 0.14734458923339844, + "learning_rate": 1.8991981672394044e-05, + "loss": 0.3118, + "step": 141 + }, + { + "epoch": 0.29225623874453305, + "grad_norm": 0.1644304096698761, + "learning_rate": 1.8969072164948453e-05, + "loss": 0.3027, + "step": 142 + }, + { + "epoch": 0.29431438127090304, + "grad_norm": 0.14632569253444672, + "learning_rate": 1.8946162657502865e-05, + "loss": 0.3023, + "step": 143 + }, + { + "epoch": 0.29637252379727297, + "grad_norm": 0.1573137789964676, + "learning_rate": 1.8923253150057274e-05, + "loss": 0.3102, + "step": 144 + }, + { + "epoch": 0.2984306663236429, + "grad_norm": 0.16423144936561584, + "learning_rate": 1.8900343642611686e-05, + "loss": 0.3033, + "step": 145 + }, + { + "epoch": 0.3004888088500129, + "grad_norm": 0.15420907735824585, + "learning_rate": 1.8877434135166095e-05, + "loss": 0.3089, + "step": 146 + }, + { + "epoch": 0.3025469513763828, + "grad_norm": 0.1579178273677826, + "learning_rate": 1.8854524627720504e-05, + "loss": 0.3071, + "step": 147 + }, + { + "epoch": 0.30460509390275275, + "grad_norm": 0.15866397321224213, + "learning_rate": 1.8831615120274916e-05, + "loss": 0.3083, + "step": 148 + }, + { + "epoch": 0.30666323642912274, + "grad_norm": 0.16651487350463867, + "learning_rate": 1.8808705612829325e-05, + "loss": 0.3099, + "step": 149 + }, + { + "epoch": 0.3087213789554927, + "grad_norm": 0.16281908750534058, + "learning_rate": 1.8785796105383734e-05, + "loss": 0.3034, + "step": 150 + }, + { + "epoch": 0.3107795214818626, + "grad_norm": 0.17449837923049927, + "learning_rate": 1.8762886597938147e-05, + "loss": 0.3054, + "step": 151 + }, + { + "epoch": 0.3128376640082326, + "grad_norm": 0.15403546392917633, + "learning_rate": 1.8739977090492555e-05, + "loss": 0.297, + "step": 152 + }, + { + "epoch": 0.31489580653460253, + "grad_norm": 0.1472466140985489, + "learning_rate": 1.8717067583046968e-05, + "loss": 0.2973, + "step": 153 + }, + { + "epoch": 0.31695394906097246, + "grad_norm": 0.16027937829494476, + "learning_rate": 1.8694158075601377e-05, + "loss": 0.3054, + "step": 154 + }, + { + "epoch": 0.31901209158734245, + "grad_norm": 0.17086225748062134, + "learning_rate": 1.8671248568155786e-05, + "loss": 0.307, + "step": 155 + }, + { + "epoch": 0.3210702341137124, + "grad_norm": 0.15930697321891785, + "learning_rate": 1.8648339060710198e-05, + "loss": 0.293, + "step": 156 + }, + { + "epoch": 0.3231283766400823, + "grad_norm": 0.17086376249790192, + "learning_rate": 1.8625429553264607e-05, + "loss": 0.293, + "step": 157 + }, + { + "epoch": 0.3251865191664523, + "grad_norm": 0.15970875322818756, + "learning_rate": 1.8602520045819016e-05, + "loss": 0.3083, + "step": 158 + }, + { + "epoch": 0.32724466169282224, + "grad_norm": 0.16355909407138824, + "learning_rate": 1.8579610538373428e-05, + "loss": 0.3139, + "step": 159 + }, + { + "epoch": 0.32930280421919217, + "grad_norm": 0.15183711051940918, + "learning_rate": 1.8556701030927837e-05, + "loss": 0.2953, + "step": 160 + }, + { + "epoch": 0.33136094674556216, + "grad_norm": 0.15123715996742249, + "learning_rate": 1.853379152348225e-05, + "loss": 0.3025, + "step": 161 + }, + { + "epoch": 0.3334190892719321, + "grad_norm": 0.1576143503189087, + "learning_rate": 1.8510882016036658e-05, + "loss": 0.2904, + "step": 162 + }, + { + "epoch": 0.335477231798302, + "grad_norm": 0.1457504779100418, + "learning_rate": 1.8487972508591067e-05, + "loss": 0.2909, + "step": 163 + }, + { + "epoch": 0.337535374324672, + "grad_norm": 0.1557442992925644, + "learning_rate": 1.846506300114548e-05, + "loss": 0.3027, + "step": 164 + }, + { + "epoch": 0.33959351685104194, + "grad_norm": 0.15662318468093872, + "learning_rate": 1.8442153493699888e-05, + "loss": 0.311, + "step": 165 + }, + { + "epoch": 0.3416516593774119, + "grad_norm": 0.16177058219909668, + "learning_rate": 1.8419243986254297e-05, + "loss": 0.2944, + "step": 166 + }, + { + "epoch": 0.34370980190378186, + "grad_norm": 0.16406729817390442, + "learning_rate": 1.8396334478808706e-05, + "loss": 0.2927, + "step": 167 + }, + { + "epoch": 0.3457679444301518, + "grad_norm": 0.16642791032791138, + "learning_rate": 1.8373424971363115e-05, + "loss": 0.3063, + "step": 168 + }, + { + "epoch": 0.34782608695652173, + "grad_norm": 0.1650693714618683, + "learning_rate": 1.8350515463917527e-05, + "loss": 0.2957, + "step": 169 + }, + { + "epoch": 0.3498842294828917, + "grad_norm": 0.15349675714969635, + "learning_rate": 1.8327605956471936e-05, + "loss": 0.297, + "step": 170 + }, + { + "epoch": 0.35194237200926165, + "grad_norm": 0.17770209908485413, + "learning_rate": 1.8304696449026348e-05, + "loss": 0.3011, + "step": 171 + }, + { + "epoch": 0.3540005145356316, + "grad_norm": 0.1647631675004959, + "learning_rate": 1.8281786941580757e-05, + "loss": 0.2962, + "step": 172 + }, + { + "epoch": 0.35605865706200157, + "grad_norm": 0.1603834480047226, + "learning_rate": 1.8258877434135166e-05, + "loss": 0.2937, + "step": 173 + }, + { + "epoch": 0.3581167995883715, + "grad_norm": 0.16780880093574524, + "learning_rate": 1.8235967926689578e-05, + "loss": 0.2997, + "step": 174 + }, + { + "epoch": 0.36017494211474144, + "grad_norm": 0.15976767241954803, + "learning_rate": 1.8213058419243987e-05, + "loss": 0.3043, + "step": 175 + }, + { + "epoch": 0.3622330846411114, + "grad_norm": 0.16236485540866852, + "learning_rate": 1.8190148911798396e-05, + "loss": 0.3069, + "step": 176 + }, + { + "epoch": 0.36429122716748136, + "grad_norm": 0.16391968727111816, + "learning_rate": 1.816723940435281e-05, + "loss": 0.2923, + "step": 177 + }, + { + "epoch": 0.3663493696938513, + "grad_norm": 0.15806889533996582, + "learning_rate": 1.8144329896907217e-05, + "loss": 0.2872, + "step": 178 + }, + { + "epoch": 0.3684075122202212, + "grad_norm": 0.1627352088689804, + "learning_rate": 1.812142038946163e-05, + "loss": 0.3032, + "step": 179 + }, + { + "epoch": 0.3704656547465912, + "grad_norm": 0.15103371441364288, + "learning_rate": 1.809851088201604e-05, + "loss": 0.2847, + "step": 180 + }, + { + "epoch": 0.37252379727296114, + "grad_norm": 0.15178488194942474, + "learning_rate": 1.8075601374570447e-05, + "loss": 0.3017, + "step": 181 + }, + { + "epoch": 0.3745819397993311, + "grad_norm": 0.15493899583816528, + "learning_rate": 1.805269186712486e-05, + "loss": 0.2901, + "step": 182 + }, + { + "epoch": 0.37664008232570106, + "grad_norm": 0.15990686416625977, + "learning_rate": 1.802978235967927e-05, + "loss": 0.2861, + "step": 183 + }, + { + "epoch": 0.378698224852071, + "grad_norm": 0.15824148058891296, + "learning_rate": 1.8006872852233677e-05, + "loss": 0.2885, + "step": 184 + }, + { + "epoch": 0.38075636737844093, + "grad_norm": 0.15690775215625763, + "learning_rate": 1.798396334478809e-05, + "loss": 0.2814, + "step": 185 + }, + { + "epoch": 0.3828145099048109, + "grad_norm": 0.15833796560764313, + "learning_rate": 1.79610538373425e-05, + "loss": 0.2847, + "step": 186 + }, + { + "epoch": 0.38487265243118085, + "grad_norm": 0.16560044884681702, + "learning_rate": 1.793814432989691e-05, + "loss": 0.3061, + "step": 187 + }, + { + "epoch": 0.3869307949575508, + "grad_norm": 0.16240179538726807, + "learning_rate": 1.791523482245132e-05, + "loss": 0.2943, + "step": 188 + }, + { + "epoch": 0.38898893748392077, + "grad_norm": 0.15825721621513367, + "learning_rate": 1.789232531500573e-05, + "loss": 0.2934, + "step": 189 + }, + { + "epoch": 0.3910470800102907, + "grad_norm": 0.16665388643741608, + "learning_rate": 1.786941580756014e-05, + "loss": 0.291, + "step": 190 + }, + { + "epoch": 0.39310522253666064, + "grad_norm": 0.16581200063228607, + "learning_rate": 1.784650630011455e-05, + "loss": 0.2849, + "step": 191 + }, + { + "epoch": 0.3951633650630306, + "grad_norm": 0.1604345291852951, + "learning_rate": 1.782359679266896e-05, + "loss": 0.3, + "step": 192 + }, + { + "epoch": 0.39722150758940056, + "grad_norm": 0.16107915341854095, + "learning_rate": 1.7800687285223368e-05, + "loss": 0.2847, + "step": 193 + }, + { + "epoch": 0.3992796501157705, + "grad_norm": 0.1571730375289917, + "learning_rate": 1.7777777777777777e-05, + "loss": 0.2863, + "step": 194 + }, + { + "epoch": 0.4013377926421405, + "grad_norm": 0.1656399518251419, + "learning_rate": 1.775486827033219e-05, + "loss": 0.2878, + "step": 195 + }, + { + "epoch": 0.4033959351685104, + "grad_norm": 0.16738460958003998, + "learning_rate": 1.7731958762886598e-05, + "loss": 0.286, + "step": 196 + }, + { + "epoch": 0.40545407769488034, + "grad_norm": 0.16704292595386505, + "learning_rate": 1.770904925544101e-05, + "loss": 0.2919, + "step": 197 + }, + { + "epoch": 0.40751222022125033, + "grad_norm": 0.16215579211711884, + "learning_rate": 1.768613974799542e-05, + "loss": 0.2874, + "step": 198 + }, + { + "epoch": 0.40957036274762026, + "grad_norm": 0.15573479235172272, + "learning_rate": 1.7663230240549828e-05, + "loss": 0.2904, + "step": 199 + }, + { + "epoch": 0.4116285052739902, + "grad_norm": 0.1707623153924942, + "learning_rate": 1.764032073310424e-05, + "loss": 0.289, + "step": 200 + }, + { + "epoch": 0.4116285052739902, + "eval_loss": 0.3214050829410553, + "eval_runtime": 2449.7742, + "eval_samples_per_second": 3.173, + "eval_steps_per_second": 0.794, + "step": 200 + }, + { + "epoch": 0.4136866478003602, + "grad_norm": 0.1699172556400299, + "learning_rate": 1.761741122565865e-05, + "loss": 0.2852, + "step": 201 + }, + { + "epoch": 0.4157447903267301, + "grad_norm": 0.19150058925151825, + "learning_rate": 1.7594501718213058e-05, + "loss": 0.29, + "step": 202 + }, + { + "epoch": 0.41780293285310005, + "grad_norm": 0.15794627368450165, + "learning_rate": 1.757159221076747e-05, + "loss": 0.2746, + "step": 203 + }, + { + "epoch": 0.41986107537947004, + "grad_norm": 0.17305190861225128, + "learning_rate": 1.754868270332188e-05, + "loss": 0.3003, + "step": 204 + }, + { + "epoch": 0.42191921790583997, + "grad_norm": 0.16257523000240326, + "learning_rate": 1.752577319587629e-05, + "loss": 0.2789, + "step": 205 + }, + { + "epoch": 0.4239773604322099, + "grad_norm": 0.17273619771003723, + "learning_rate": 1.75028636884307e-05, + "loss": 0.2917, + "step": 206 + }, + { + "epoch": 0.4260355029585799, + "grad_norm": 0.17502790689468384, + "learning_rate": 1.747995418098511e-05, + "loss": 0.2992, + "step": 207 + }, + { + "epoch": 0.4280936454849498, + "grad_norm": 0.16464050114154816, + "learning_rate": 1.745704467353952e-05, + "loss": 0.2873, + "step": 208 + }, + { + "epoch": 0.43015178801131976, + "grad_norm": 0.1681668758392334, + "learning_rate": 1.743413516609393e-05, + "loss": 0.2991, + "step": 209 + }, + { + "epoch": 0.43220993053768975, + "grad_norm": 0.16957956552505493, + "learning_rate": 1.741122565864834e-05, + "loss": 0.2868, + "step": 210 + }, + { + "epoch": 0.4342680730640597, + "grad_norm": 0.15875883400440216, + "learning_rate": 1.738831615120275e-05, + "loss": 0.2946, + "step": 211 + }, + { + "epoch": 0.4363262155904296, + "grad_norm": 0.18127889931201935, + "learning_rate": 1.736540664375716e-05, + "loss": 0.2835, + "step": 212 + }, + { + "epoch": 0.4383843581167996, + "grad_norm": 0.17822811007499695, + "learning_rate": 1.7342497136311573e-05, + "loss": 0.2944, + "step": 213 + }, + { + "epoch": 0.44044250064316953, + "grad_norm": 0.17555806040763855, + "learning_rate": 1.731958762886598e-05, + "loss": 0.3001, + "step": 214 + }, + { + "epoch": 0.44250064316953946, + "grad_norm": 0.18709121644496918, + "learning_rate": 1.729667812142039e-05, + "loss": 0.282, + "step": 215 + }, + { + "epoch": 0.44455878569590945, + "grad_norm": 0.16322475671768188, + "learning_rate": 1.7273768613974803e-05, + "loss": 0.2883, + "step": 216 + }, + { + "epoch": 0.4466169282222794, + "grad_norm": 0.1677054911851883, + "learning_rate": 1.7250859106529212e-05, + "loss": 0.28, + "step": 217 + }, + { + "epoch": 0.4486750707486493, + "grad_norm": 0.15764063596725464, + "learning_rate": 1.722794959908362e-05, + "loss": 0.2768, + "step": 218 + }, + { + "epoch": 0.4507332132750193, + "grad_norm": 0.16166841983795166, + "learning_rate": 1.7205040091638033e-05, + "loss": 0.2868, + "step": 219 + }, + { + "epoch": 0.45279135580138924, + "grad_norm": 0.1799350380897522, + "learning_rate": 1.7182130584192442e-05, + "loss": 0.2891, + "step": 220 + }, + { + "epoch": 0.45484949832775917, + "grad_norm": 0.18119174242019653, + "learning_rate": 1.715922107674685e-05, + "loss": 0.2841, + "step": 221 + }, + { + "epoch": 0.45690764085412916, + "grad_norm": 0.17725548148155212, + "learning_rate": 1.713631156930126e-05, + "loss": 0.3038, + "step": 222 + }, + { + "epoch": 0.4589657833804991, + "grad_norm": 0.1628233790397644, + "learning_rate": 1.7113402061855672e-05, + "loss": 0.2868, + "step": 223 + }, + { + "epoch": 0.461023925906869, + "grad_norm": 0.1745166927576065, + "learning_rate": 1.709049255441008e-05, + "loss": 0.3033, + "step": 224 + }, + { + "epoch": 0.463082068433239, + "grad_norm": 0.17708267271518707, + "learning_rate": 1.706758304696449e-05, + "loss": 0.2842, + "step": 225 + }, + { + "epoch": 0.46514021095960895, + "grad_norm": 0.1738453358411789, + "learning_rate": 1.7044673539518902e-05, + "loss": 0.3005, + "step": 226 + }, + { + "epoch": 0.4671983534859789, + "grad_norm": 0.1706874966621399, + "learning_rate": 1.702176403207331e-05, + "loss": 0.2924, + "step": 227 + }, + { + "epoch": 0.46925649601234887, + "grad_norm": 0.1697423756122589, + "learning_rate": 1.699885452462772e-05, + "loss": 0.2783, + "step": 228 + }, + { + "epoch": 0.4713146385387188, + "grad_norm": 0.1783403754234314, + "learning_rate": 1.6975945017182132e-05, + "loss": 0.2924, + "step": 229 + }, + { + "epoch": 0.47337278106508873, + "grad_norm": 0.17431536316871643, + "learning_rate": 1.695303550973654e-05, + "loss": 0.2792, + "step": 230 + }, + { + "epoch": 0.4754309235914587, + "grad_norm": 0.164026141166687, + "learning_rate": 1.6930126002290953e-05, + "loss": 0.2825, + "step": 231 + }, + { + "epoch": 0.47748906611782865, + "grad_norm": 0.16449657082557678, + "learning_rate": 1.6907216494845362e-05, + "loss": 0.2831, + "step": 232 + }, + { + "epoch": 0.4795472086441986, + "grad_norm": 0.1812741607427597, + "learning_rate": 1.688430698739977e-05, + "loss": 0.2849, + "step": 233 + }, + { + "epoch": 0.4816053511705686, + "grad_norm": 0.18431834876537323, + "learning_rate": 1.6861397479954183e-05, + "loss": 0.2802, + "step": 234 + }, + { + "epoch": 0.4836634936969385, + "grad_norm": 0.18349015712738037, + "learning_rate": 1.6838487972508592e-05, + "loss": 0.2804, + "step": 235 + }, + { + "epoch": 0.48572163622330844, + "grad_norm": 0.1769968420267105, + "learning_rate": 1.6815578465063e-05, + "loss": 0.2777, + "step": 236 + }, + { + "epoch": 0.4877797787496784, + "grad_norm": 0.17207500338554382, + "learning_rate": 1.6792668957617413e-05, + "loss": 0.2883, + "step": 237 + }, + { + "epoch": 0.48983792127604836, + "grad_norm": 0.1729692667722702, + "learning_rate": 1.6769759450171822e-05, + "loss": 0.2784, + "step": 238 + }, + { + "epoch": 0.4918960638024183, + "grad_norm": 0.17234881222248077, + "learning_rate": 1.6746849942726235e-05, + "loss": 0.2816, + "step": 239 + }, + { + "epoch": 0.4939542063287883, + "grad_norm": 0.17132551968097687, + "learning_rate": 1.6723940435280644e-05, + "loss": 0.2812, + "step": 240 + }, + { + "epoch": 0.4960123488551582, + "grad_norm": 0.1752254068851471, + "learning_rate": 1.6701030927835052e-05, + "loss": 0.2799, + "step": 241 + }, + { + "epoch": 0.49807049138152815, + "grad_norm": 0.1768665313720703, + "learning_rate": 1.6678121420389465e-05, + "loss": 0.2966, + "step": 242 + }, + { + "epoch": 0.5001286339078981, + "grad_norm": 0.18139514327049255, + "learning_rate": 1.6655211912943874e-05, + "loss": 0.2816, + "step": 243 + }, + { + "epoch": 0.5021867764342681, + "grad_norm": 0.17312943935394287, + "learning_rate": 1.6632302405498283e-05, + "loss": 0.2845, + "step": 244 + }, + { + "epoch": 0.5042449189606381, + "grad_norm": 0.17966389656066895, + "learning_rate": 1.6609392898052695e-05, + "loss": 0.2864, + "step": 245 + }, + { + "epoch": 0.506303061487008, + "grad_norm": 0.16653811931610107, + "learning_rate": 1.6586483390607104e-05, + "loss": 0.2759, + "step": 246 + }, + { + "epoch": 0.5083612040133779, + "grad_norm": 0.1634613424539566, + "learning_rate": 1.6563573883161516e-05, + "loss": 0.2728, + "step": 247 + }, + { + "epoch": 0.5104193465397479, + "grad_norm": 0.17358507215976715, + "learning_rate": 1.654066437571592e-05, + "loss": 0.2706, + "step": 248 + }, + { + "epoch": 0.5124774890661178, + "grad_norm": 0.17524316906929016, + "learning_rate": 1.6517754868270334e-05, + "loss": 0.2805, + "step": 249 + }, + { + "epoch": 0.5145356315924878, + "grad_norm": 0.18134094774723053, + "learning_rate": 1.6494845360824743e-05, + "loss": 0.2909, + "step": 250 + }, + { + "epoch": 0.5165937741188578, + "grad_norm": 0.17795510590076447, + "learning_rate": 1.647193585337915e-05, + "loss": 0.2889, + "step": 251 + }, + { + "epoch": 0.5186519166452277, + "grad_norm": 0.16782547533512115, + "learning_rate": 1.6449026345933564e-05, + "loss": 0.2842, + "step": 252 + }, + { + "epoch": 0.5207100591715976, + "grad_norm": 0.17360062897205353, + "learning_rate": 1.6426116838487973e-05, + "loss": 0.2763, + "step": 253 + }, + { + "epoch": 0.5227682016979676, + "grad_norm": 0.17241406440734863, + "learning_rate": 1.6403207331042385e-05, + "loss": 0.2753, + "step": 254 + }, + { + "epoch": 0.5248263442243375, + "grad_norm": 0.1709229201078415, + "learning_rate": 1.6380297823596794e-05, + "loss": 0.2732, + "step": 255 + }, + { + "epoch": 0.5268844867507075, + "grad_norm": 0.1807374209165573, + "learning_rate": 1.6357388316151203e-05, + "loss": 0.2856, + "step": 256 + }, + { + "epoch": 0.5289426292770775, + "grad_norm": 0.1749904304742813, + "learning_rate": 1.6334478808705615e-05, + "loss": 0.285, + "step": 257 + }, + { + "epoch": 0.5310007718034474, + "grad_norm": 0.16673170030117035, + "learning_rate": 1.6311569301260024e-05, + "loss": 0.2825, + "step": 258 + }, + { + "epoch": 0.5330589143298173, + "grad_norm": 0.17239685356616974, + "learning_rate": 1.6288659793814433e-05, + "loss": 0.2845, + "step": 259 + }, + { + "epoch": 0.5351170568561873, + "grad_norm": 0.1831504851579666, + "learning_rate": 1.6265750286368845e-05, + "loss": 0.2859, + "step": 260 + }, + { + "epoch": 0.5371751993825572, + "grad_norm": 0.18507827818393707, + "learning_rate": 1.6242840778923254e-05, + "loss": 0.293, + "step": 261 + }, + { + "epoch": 0.5392333419089272, + "grad_norm": 0.16738134622573853, + "learning_rate": 1.6219931271477663e-05, + "loss": 0.2853, + "step": 262 + }, + { + "epoch": 0.5412914844352972, + "grad_norm": 0.1701226830482483, + "learning_rate": 1.6197021764032075e-05, + "loss": 0.2763, + "step": 263 + }, + { + "epoch": 0.5433496269616671, + "grad_norm": 0.18195705115795135, + "learning_rate": 1.6174112256586484e-05, + "loss": 0.2797, + "step": 264 + }, + { + "epoch": 0.545407769488037, + "grad_norm": 0.1832309514284134, + "learning_rate": 1.6151202749140896e-05, + "loss": 0.2885, + "step": 265 + }, + { + "epoch": 0.547465912014407, + "grad_norm": 0.1773810088634491, + "learning_rate": 1.6128293241695305e-05, + "loss": 0.2682, + "step": 266 + }, + { + "epoch": 0.5495240545407769, + "grad_norm": 0.16989603638648987, + "learning_rate": 1.6105383734249714e-05, + "loss": 0.2821, + "step": 267 + }, + { + "epoch": 0.551582197067147, + "grad_norm": 0.17835170030593872, + "learning_rate": 1.6082474226804127e-05, + "loss": 0.2774, + "step": 268 + }, + { + "epoch": 0.5536403395935169, + "grad_norm": 0.1777082234621048, + "learning_rate": 1.6059564719358535e-05, + "loss": 0.2726, + "step": 269 + }, + { + "epoch": 0.5556984821198868, + "grad_norm": 0.18766450881958008, + "learning_rate": 1.6036655211912944e-05, + "loss": 0.2879, + "step": 270 + }, + { + "epoch": 0.5577566246462567, + "grad_norm": 0.1868186593055725, + "learning_rate": 1.6013745704467357e-05, + "loss": 0.2808, + "step": 271 + }, + { + "epoch": 0.5598147671726267, + "grad_norm": 0.16695882380008698, + "learning_rate": 1.5990836197021766e-05, + "loss": 0.2668, + "step": 272 + }, + { + "epoch": 0.5618729096989966, + "grad_norm": 0.17224495112895966, + "learning_rate": 1.5967926689576178e-05, + "loss": 0.2682, + "step": 273 + }, + { + "epoch": 0.5639310522253667, + "grad_norm": 0.20116423070430756, + "learning_rate": 1.5945017182130587e-05, + "loss": 0.276, + "step": 274 + }, + { + "epoch": 0.5659891947517366, + "grad_norm": 0.19478343427181244, + "learning_rate": 1.5922107674684996e-05, + "loss": 0.2854, + "step": 275 + }, + { + "epoch": 0.5680473372781065, + "grad_norm": 0.20242950320243835, + "learning_rate": 1.5899198167239405e-05, + "loss": 0.2854, + "step": 276 + }, + { + "epoch": 0.5701054798044765, + "grad_norm": 0.19146093726158142, + "learning_rate": 1.5876288659793813e-05, + "loss": 0.2817, + "step": 277 + }, + { + "epoch": 0.5721636223308464, + "grad_norm": 0.1804896742105484, + "learning_rate": 1.5853379152348226e-05, + "loss": 0.2714, + "step": 278 + }, + { + "epoch": 0.5742217648572163, + "grad_norm": 0.19315646588802338, + "learning_rate": 1.5830469644902635e-05, + "loss": 0.2703, + "step": 279 + }, + { + "epoch": 0.5762799073835864, + "grad_norm": 0.1910266876220703, + "learning_rate": 1.5807560137457047e-05, + "loss": 0.2728, + "step": 280 + }, + { + "epoch": 0.5783380499099563, + "grad_norm": 0.20330773293972015, + "learning_rate": 1.5784650630011456e-05, + "loss": 0.2717, + "step": 281 + }, + { + "epoch": 0.5803961924363262, + "grad_norm": 0.19080683588981628, + "learning_rate": 1.5761741122565865e-05, + "loss": 0.2679, + "step": 282 + }, + { + "epoch": 0.5824543349626962, + "grad_norm": 0.18052135407924652, + "learning_rate": 1.5738831615120277e-05, + "loss": 0.2815, + "step": 283 + }, + { + "epoch": 0.5845124774890661, + "grad_norm": 0.1998361051082611, + "learning_rate": 1.5715922107674686e-05, + "loss": 0.2888, + "step": 284 + }, + { + "epoch": 0.586570620015436, + "grad_norm": 0.1978764683008194, + "learning_rate": 1.5693012600229095e-05, + "loss": 0.2926, + "step": 285 + }, + { + "epoch": 0.5886287625418061, + "grad_norm": 0.17189203202724457, + "learning_rate": 1.5670103092783507e-05, + "loss": 0.2674, + "step": 286 + }, + { + "epoch": 0.590686905068176, + "grad_norm": 0.1937166303396225, + "learning_rate": 1.5647193585337916e-05, + "loss": 0.2838, + "step": 287 + }, + { + "epoch": 0.5927450475945459, + "grad_norm": 0.18978627026081085, + "learning_rate": 1.5624284077892328e-05, + "loss": 0.273, + "step": 288 + }, + { + "epoch": 0.5948031901209159, + "grad_norm": 0.17718705534934998, + "learning_rate": 1.5601374570446737e-05, + "loss": 0.2842, + "step": 289 + }, + { + "epoch": 0.5968613326472858, + "grad_norm": 0.1912536770105362, + "learning_rate": 1.5578465063001146e-05, + "loss": 0.2736, + "step": 290 + }, + { + "epoch": 0.5989194751736557, + "grad_norm": 0.18104907870292664, + "learning_rate": 1.555555555555556e-05, + "loss": 0.274, + "step": 291 + }, + { + "epoch": 0.6009776177000258, + "grad_norm": 0.1620381772518158, + "learning_rate": 1.5532646048109967e-05, + "loss": 0.2663, + "step": 292 + }, + { + "epoch": 0.6030357602263957, + "grad_norm": 0.17973916232585907, + "learning_rate": 1.5509736540664376e-05, + "loss": 0.2791, + "step": 293 + }, + { + "epoch": 0.6050939027527656, + "grad_norm": 0.16821186244487762, + "learning_rate": 1.548682703321879e-05, + "loss": 0.2787, + "step": 294 + }, + { + "epoch": 0.6071520452791356, + "grad_norm": 0.18426693975925446, + "learning_rate": 1.5463917525773197e-05, + "loss": 0.2886, + "step": 295 + }, + { + "epoch": 0.6092101878055055, + "grad_norm": 0.19796033203601837, + "learning_rate": 1.5441008018327606e-05, + "loss": 0.268, + "step": 296 + }, + { + "epoch": 0.6112683303318754, + "grad_norm": 0.1971343755722046, + "learning_rate": 1.541809851088202e-05, + "loss": 0.2761, + "step": 297 + }, + { + "epoch": 0.6133264728582455, + "grad_norm": 0.17458567023277283, + "learning_rate": 1.5395189003436427e-05, + "loss": 0.2831, + "step": 298 + }, + { + "epoch": 0.6153846153846154, + "grad_norm": 0.17610400915145874, + "learning_rate": 1.537227949599084e-05, + "loss": 0.2691, + "step": 299 + }, + { + "epoch": 0.6174427579109854, + "grad_norm": 0.1929042488336563, + "learning_rate": 1.534936998854525e-05, + "loss": 0.2847, + "step": 300 + }, + { + "epoch": 0.6174427579109854, + "eval_loss": 0.2959522604942322, + "eval_runtime": 2428.6339, + "eval_samples_per_second": 3.201, + "eval_steps_per_second": 0.8, + "step": 300 + }, + { + "epoch": 0.6195009004373553, + "grad_norm": 0.19430233538150787, + "learning_rate": 1.5326460481099657e-05, + "loss": 0.279, + "step": 301 + }, + { + "epoch": 0.6215590429637252, + "grad_norm": 0.18542642891407013, + "learning_rate": 1.5303550973654066e-05, + "loss": 0.2695, + "step": 302 + }, + { + "epoch": 0.6236171854900951, + "grad_norm": 0.1850169450044632, + "learning_rate": 1.5280641466208475e-05, + "loss": 0.2847, + "step": 303 + }, + { + "epoch": 0.6256753280164652, + "grad_norm": 0.18449267745018005, + "learning_rate": 1.5257731958762888e-05, + "loss": 0.2804, + "step": 304 + }, + { + "epoch": 0.6277334705428351, + "grad_norm": 0.18608458340168, + "learning_rate": 1.5234822451317296e-05, + "loss": 0.2792, + "step": 305 + }, + { + "epoch": 0.6297916130692051, + "grad_norm": 0.21136076748371124, + "learning_rate": 1.5211912943871707e-05, + "loss": 0.2829, + "step": 306 + }, + { + "epoch": 0.631849755595575, + "grad_norm": 0.19672206044197083, + "learning_rate": 1.5189003436426118e-05, + "loss": 0.2854, + "step": 307 + }, + { + "epoch": 0.6339078981219449, + "grad_norm": 0.1834034025669098, + "learning_rate": 1.5166093928980528e-05, + "loss": 0.2775, + "step": 308 + }, + { + "epoch": 0.6359660406483149, + "grad_norm": 0.18414819240570068, + "learning_rate": 1.5143184421534937e-05, + "loss": 0.2794, + "step": 309 + }, + { + "epoch": 0.6380241831746849, + "grad_norm": 0.1890152245759964, + "learning_rate": 1.5120274914089348e-05, + "loss": 0.2718, + "step": 310 + }, + { + "epoch": 0.6400823257010548, + "grad_norm": 0.18923887610435486, + "learning_rate": 1.5097365406643758e-05, + "loss": 0.2795, + "step": 311 + }, + { + "epoch": 0.6421404682274248, + "grad_norm": 0.20047079026699066, + "learning_rate": 1.5074455899198169e-05, + "loss": 0.2811, + "step": 312 + }, + { + "epoch": 0.6441986107537947, + "grad_norm": 0.1910201609134674, + "learning_rate": 1.5051546391752578e-05, + "loss": 0.2732, + "step": 313 + }, + { + "epoch": 0.6462567532801646, + "grad_norm": 0.2021956443786621, + "learning_rate": 1.5028636884306988e-05, + "loss": 0.2806, + "step": 314 + }, + { + "epoch": 0.6483148958065346, + "grad_norm": 0.18957914412021637, + "learning_rate": 1.5005727376861399e-05, + "loss": 0.2681, + "step": 315 + }, + { + "epoch": 0.6503730383329046, + "grad_norm": 0.19858811795711517, + "learning_rate": 1.498281786941581e-05, + "loss": 0.2805, + "step": 316 + }, + { + "epoch": 0.6524311808592745, + "grad_norm": 0.1731935292482376, + "learning_rate": 1.4959908361970218e-05, + "loss": 0.2646, + "step": 317 + }, + { + "epoch": 0.6544893233856445, + "grad_norm": 0.19619058072566986, + "learning_rate": 1.4936998854524629e-05, + "loss": 0.2965, + "step": 318 + }, + { + "epoch": 0.6565474659120144, + "grad_norm": 0.18745696544647217, + "learning_rate": 1.491408934707904e-05, + "loss": 0.2766, + "step": 319 + }, + { + "epoch": 0.6586056084383843, + "grad_norm": 0.18006449937820435, + "learning_rate": 1.489117983963345e-05, + "loss": 0.2788, + "step": 320 + }, + { + "epoch": 0.6606637509647543, + "grad_norm": 0.17593689262866974, + "learning_rate": 1.486827033218786e-05, + "loss": 0.2813, + "step": 321 + }, + { + "epoch": 0.6627218934911243, + "grad_norm": 0.18695640563964844, + "learning_rate": 1.484536082474227e-05, + "loss": 0.281, + "step": 322 + }, + { + "epoch": 0.6647800360174942, + "grad_norm": 0.17909488081932068, + "learning_rate": 1.482245131729668e-05, + "loss": 0.2814, + "step": 323 + }, + { + "epoch": 0.6668381785438642, + "grad_norm": 0.19074076414108276, + "learning_rate": 1.4799541809851091e-05, + "loss": 0.2721, + "step": 324 + }, + { + "epoch": 0.6688963210702341, + "grad_norm": 0.19175754487514496, + "learning_rate": 1.47766323024055e-05, + "loss": 0.2754, + "step": 325 + }, + { + "epoch": 0.670954463596604, + "grad_norm": 0.18646575510501862, + "learning_rate": 1.475372279495991e-05, + "loss": 0.2678, + "step": 326 + }, + { + "epoch": 0.673012606122974, + "grad_norm": 0.18553243577480316, + "learning_rate": 1.4730813287514321e-05, + "loss": 0.281, + "step": 327 + }, + { + "epoch": 0.675070748649344, + "grad_norm": 0.17120976746082306, + "learning_rate": 1.470790378006873e-05, + "loss": 0.2691, + "step": 328 + }, + { + "epoch": 0.677128891175714, + "grad_norm": 0.19170524179935455, + "learning_rate": 1.4684994272623139e-05, + "loss": 0.2685, + "step": 329 + }, + { + "epoch": 0.6791870337020839, + "grad_norm": 0.1851339191198349, + "learning_rate": 1.466208476517755e-05, + "loss": 0.266, + "step": 330 + }, + { + "epoch": 0.6812451762284538, + "grad_norm": 0.1678062081336975, + "learning_rate": 1.4639175257731958e-05, + "loss": 0.2609, + "step": 331 + }, + { + "epoch": 0.6833033187548238, + "grad_norm": 0.17913252115249634, + "learning_rate": 1.4616265750286369e-05, + "loss": 0.2716, + "step": 332 + }, + { + "epoch": 0.6853614612811937, + "grad_norm": 0.1859239637851715, + "learning_rate": 1.459335624284078e-05, + "loss": 0.2712, + "step": 333 + }, + { + "epoch": 0.6874196038075637, + "grad_norm": 0.18390226364135742, + "learning_rate": 1.457044673539519e-05, + "loss": 0.2827, + "step": 334 + }, + { + "epoch": 0.6894777463339337, + "grad_norm": 0.18520398437976837, + "learning_rate": 1.4547537227949599e-05, + "loss": 0.2721, + "step": 335 + }, + { + "epoch": 0.6915358888603036, + "grad_norm": 0.18416717648506165, + "learning_rate": 1.452462772050401e-05, + "loss": 0.2683, + "step": 336 + }, + { + "epoch": 0.6935940313866735, + "grad_norm": 0.18727894127368927, + "learning_rate": 1.450171821305842e-05, + "loss": 0.2733, + "step": 337 + }, + { + "epoch": 0.6956521739130435, + "grad_norm": 0.18597093224525452, + "learning_rate": 1.447880870561283e-05, + "loss": 0.2708, + "step": 338 + }, + { + "epoch": 0.6977103164394134, + "grad_norm": 0.1786068081855774, + "learning_rate": 1.445589919816724e-05, + "loss": 0.2667, + "step": 339 + }, + { + "epoch": 0.6997684589657834, + "grad_norm": 0.17466600239276886, + "learning_rate": 1.443298969072165e-05, + "loss": 0.2786, + "step": 340 + }, + { + "epoch": 0.7018266014921534, + "grad_norm": 0.185857355594635, + "learning_rate": 1.4410080183276061e-05, + "loss": 0.2759, + "step": 341 + }, + { + "epoch": 0.7038847440185233, + "grad_norm": 0.2004527747631073, + "learning_rate": 1.4387170675830471e-05, + "loss": 0.2847, + "step": 342 + }, + { + "epoch": 0.7059428865448932, + "grad_norm": 0.18774060904979706, + "learning_rate": 1.436426116838488e-05, + "loss": 0.2766, + "step": 343 + }, + { + "epoch": 0.7080010290712632, + "grad_norm": 0.1840328425168991, + "learning_rate": 1.4341351660939291e-05, + "loss": 0.2722, + "step": 344 + }, + { + "epoch": 0.7100591715976331, + "grad_norm": 0.19089624285697937, + "learning_rate": 1.4318442153493702e-05, + "loss": 0.2779, + "step": 345 + }, + { + "epoch": 0.7121173141240031, + "grad_norm": 0.1848018616437912, + "learning_rate": 1.4295532646048112e-05, + "loss": 0.2739, + "step": 346 + }, + { + "epoch": 0.7141754566503731, + "grad_norm": 0.18844038248062134, + "learning_rate": 1.4272623138602521e-05, + "loss": 0.27, + "step": 347 + }, + { + "epoch": 0.716233599176743, + "grad_norm": 0.19289302825927734, + "learning_rate": 1.4249713631156932e-05, + "loss": 0.2743, + "step": 348 + }, + { + "epoch": 0.7182917417031129, + "grad_norm": 0.18738920986652374, + "learning_rate": 1.4226804123711342e-05, + "loss": 0.2657, + "step": 349 + }, + { + "epoch": 0.7203498842294829, + "grad_norm": 0.1925181746482849, + "learning_rate": 1.4203894616265753e-05, + "loss": 0.2637, + "step": 350 + }, + { + "epoch": 0.7224080267558528, + "grad_norm": 0.19114750623703003, + "learning_rate": 1.4180985108820162e-05, + "loss": 0.2758, + "step": 351 + }, + { + "epoch": 0.7244661692822228, + "grad_norm": 0.18310120701789856, + "learning_rate": 1.4158075601374572e-05, + "loss": 0.2777, + "step": 352 + }, + { + "epoch": 0.7265243118085928, + "grad_norm": 0.2045605331659317, + "learning_rate": 1.4135166093928983e-05, + "loss": 0.2653, + "step": 353 + }, + { + "epoch": 0.7285824543349627, + "grad_norm": 0.1856454759836197, + "learning_rate": 1.4112256586483393e-05, + "loss": 0.267, + "step": 354 + }, + { + "epoch": 0.7306405968613326, + "grad_norm": 0.1855366826057434, + "learning_rate": 1.4089347079037802e-05, + "loss": 0.2805, + "step": 355 + }, + { + "epoch": 0.7326987393877026, + "grad_norm": 0.17913414537906647, + "learning_rate": 1.4066437571592213e-05, + "loss": 0.2755, + "step": 356 + }, + { + "epoch": 0.7347568819140725, + "grad_norm": 0.2057684361934662, + "learning_rate": 1.404352806414662e-05, + "loss": 0.2668, + "step": 357 + }, + { + "epoch": 0.7368150244404424, + "grad_norm": 0.190156951546669, + "learning_rate": 1.402061855670103e-05, + "loss": 0.2778, + "step": 358 + }, + { + "epoch": 0.7388731669668125, + "grad_norm": 0.19387219846248627, + "learning_rate": 1.3997709049255441e-05, + "loss": 0.2785, + "step": 359 + }, + { + "epoch": 0.7409313094931824, + "grad_norm": 0.1933836042881012, + "learning_rate": 1.3974799541809852e-05, + "loss": 0.2661, + "step": 360 + }, + { + "epoch": 0.7429894520195524, + "grad_norm": 0.19618812203407288, + "learning_rate": 1.3951890034364261e-05, + "loss": 0.2622, + "step": 361 + }, + { + "epoch": 0.7450475945459223, + "grad_norm": 0.18786942958831787, + "learning_rate": 1.3928980526918671e-05, + "loss": 0.2695, + "step": 362 + }, + { + "epoch": 0.7471057370722922, + "grad_norm": 0.19361330568790436, + "learning_rate": 1.3906071019473082e-05, + "loss": 0.2869, + "step": 363 + }, + { + "epoch": 0.7491638795986622, + "grad_norm": 0.19813291728496552, + "learning_rate": 1.3883161512027493e-05, + "loss": 0.2753, + "step": 364 + }, + { + "epoch": 0.7512220221250322, + "grad_norm": 0.1891734004020691, + "learning_rate": 1.3860252004581902e-05, + "loss": 0.2694, + "step": 365 + }, + { + "epoch": 0.7532801646514021, + "grad_norm": 0.18902742862701416, + "learning_rate": 1.3837342497136312e-05, + "loss": 0.2675, + "step": 366 + }, + { + "epoch": 0.7553383071777721, + "grad_norm": 0.19838480651378632, + "learning_rate": 1.3814432989690723e-05, + "loss": 0.2721, + "step": 367 + }, + { + "epoch": 0.757396449704142, + "grad_norm": 0.20880939066410065, + "learning_rate": 1.3791523482245133e-05, + "loss": 0.2641, + "step": 368 + }, + { + "epoch": 0.7594545922305119, + "grad_norm": 0.20068003237247467, + "learning_rate": 1.3768613974799542e-05, + "loss": 0.2945, + "step": 369 + }, + { + "epoch": 0.7615127347568819, + "grad_norm": 0.19780132174491882, + "learning_rate": 1.3745704467353953e-05, + "loss": 0.2687, + "step": 370 + }, + { + "epoch": 0.7635708772832519, + "grad_norm": 0.19194689393043518, + "learning_rate": 1.3722794959908363e-05, + "loss": 0.2731, + "step": 371 + }, + { + "epoch": 0.7656290198096218, + "grad_norm": 0.19504573941230774, + "learning_rate": 1.3699885452462774e-05, + "loss": 0.2551, + "step": 372 + }, + { + "epoch": 0.7676871623359918, + "grad_norm": 0.18304413557052612, + "learning_rate": 1.3676975945017183e-05, + "loss": 0.2692, + "step": 373 + }, + { + "epoch": 0.7697453048623617, + "grad_norm": 0.2051483392715454, + "learning_rate": 1.3654066437571593e-05, + "loss": 0.2791, + "step": 374 + }, + { + "epoch": 0.7718034473887316, + "grad_norm": 0.18748973309993744, + "learning_rate": 1.3631156930126004e-05, + "loss": 0.2671, + "step": 375 + }, + { + "epoch": 0.7738615899151016, + "grad_norm": 0.19167177379131317, + "learning_rate": 1.3608247422680415e-05, + "loss": 0.2766, + "step": 376 + }, + { + "epoch": 0.7759197324414716, + "grad_norm": 0.17931750416755676, + "learning_rate": 1.3585337915234824e-05, + "loss": 0.2748, + "step": 377 + }, + { + "epoch": 0.7779778749678415, + "grad_norm": 0.19437509775161743, + "learning_rate": 1.3562428407789234e-05, + "loss": 0.2667, + "step": 378 + }, + { + "epoch": 0.7800360174942115, + "grad_norm": 0.19813868403434753, + "learning_rate": 1.3539518900343645e-05, + "loss": 0.2771, + "step": 379 + }, + { + "epoch": 0.7820941600205814, + "grad_norm": 0.19205260276794434, + "learning_rate": 1.3516609392898055e-05, + "loss": 0.2703, + "step": 380 + }, + { + "epoch": 0.7841523025469513, + "grad_norm": 0.19039763510227203, + "learning_rate": 1.3493699885452464e-05, + "loss": 0.264, + "step": 381 + }, + { + "epoch": 0.7862104450733213, + "grad_norm": 0.18269500136375427, + "learning_rate": 1.3470790378006875e-05, + "loss": 0.2653, + "step": 382 + }, + { + "epoch": 0.7882685875996913, + "grad_norm": 0.1922067403793335, + "learning_rate": 1.3447880870561285e-05, + "loss": 0.2754, + "step": 383 + }, + { + "epoch": 0.7903267301260612, + "grad_norm": 0.19615666568279266, + "learning_rate": 1.3424971363115693e-05, + "loss": 0.2811, + "step": 384 + }, + { + "epoch": 0.7923848726524312, + "grad_norm": 0.19037973880767822, + "learning_rate": 1.3402061855670103e-05, + "loss": 0.2673, + "step": 385 + }, + { + "epoch": 0.7944430151788011, + "grad_norm": 0.191124826669693, + "learning_rate": 1.3379152348224514e-05, + "loss": 0.2683, + "step": 386 + }, + { + "epoch": 0.796501157705171, + "grad_norm": 0.18429923057556152, + "learning_rate": 1.3356242840778923e-05, + "loss": 0.2698, + "step": 387 + }, + { + "epoch": 0.798559300231541, + "grad_norm": 0.1839045137166977, + "learning_rate": 1.3333333333333333e-05, + "loss": 0.2895, + "step": 388 + }, + { + "epoch": 0.800617442757911, + "grad_norm": 0.1944131702184677, + "learning_rate": 1.3310423825887744e-05, + "loss": 0.2641, + "step": 389 + }, + { + "epoch": 0.802675585284281, + "grad_norm": 0.20407740771770477, + "learning_rate": 1.3287514318442154e-05, + "loss": 0.2743, + "step": 390 + }, + { + "epoch": 0.8047337278106509, + "grad_norm": 0.1814037561416626, + "learning_rate": 1.3264604810996563e-05, + "loss": 0.2672, + "step": 391 + }, + { + "epoch": 0.8067918703370208, + "grad_norm": 0.1886950582265854, + "learning_rate": 1.3241695303550974e-05, + "loss": 0.2725, + "step": 392 + }, + { + "epoch": 0.8088500128633908, + "grad_norm": 0.19429941475391388, + "learning_rate": 1.3218785796105385e-05, + "loss": 0.2669, + "step": 393 + }, + { + "epoch": 0.8109081553897607, + "grad_norm": 0.19143058359622955, + "learning_rate": 1.3195876288659795e-05, + "loss": 0.2659, + "step": 394 + }, + { + "epoch": 0.8129662979161307, + "grad_norm": 0.2213468849658966, + "learning_rate": 1.3172966781214204e-05, + "loss": 0.2764, + "step": 395 + }, + { + "epoch": 0.8150244404425007, + "grad_norm": 0.2040800005197525, + "learning_rate": 1.3150057273768615e-05, + "loss": 0.2783, + "step": 396 + }, + { + "epoch": 0.8170825829688706, + "grad_norm": 0.1948375254869461, + "learning_rate": 1.3127147766323025e-05, + "loss": 0.2689, + "step": 397 + }, + { + "epoch": 0.8191407254952405, + "grad_norm": 0.1915021538734436, + "learning_rate": 1.3104238258877436e-05, + "loss": 0.2808, + "step": 398 + }, + { + "epoch": 0.8211988680216105, + "grad_norm": 0.19760248064994812, + "learning_rate": 1.3081328751431845e-05, + "loss": 0.2712, + "step": 399 + }, + { + "epoch": 0.8232570105479804, + "grad_norm": 0.2082677185535431, + "learning_rate": 1.3058419243986255e-05, + "loss": 0.2707, + "step": 400 + }, + { + "epoch": 0.8232570105479804, + "eval_loss": 0.28778496384620667, + "eval_runtime": 2427.9096, + "eval_samples_per_second": 3.202, + "eval_steps_per_second": 0.801, + "step": 400 + } + ], + "logging_steps": 1, + "max_steps": 970, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.3213273121404846e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-400/training_args.bin b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..5999c7ee9dd10ee9076d748e4757533e635fa832 --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ee55a11f5a306eb7c39b536fdfe2459bc279e468da50f6adda478c4deffcb812 +size 5688 diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-500/README.md b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-500/README.md new file mode 100644 index 0000000000000000000000000000000000000000..fb0d77d70fdc5c829c8889cb85828736b7eb9714 --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-500/README.md @@ -0,0 +1,202 @@ +--- +base_model: unsloth/codegemma-7b-it +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.15.0 \ No newline at end of file diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-500/adapter_config.json b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-500/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e841602c6a59fc7b085ac647af4d4c312445d261 --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-500/adapter_config.json @@ -0,0 +1,39 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "unsloth/codegemma-7b-it", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "o_proj", + "up_proj", + "q_proj", + "down_proj", + "gate_proj", + "k_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-500/adapter_model.safetensors b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-500/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..20b801fa9a93b5e1489aa02e9b2aab0b3540f1b6 --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-500/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:04187853f7b16dc0eeb41765224d8436a0d897aebb8af23799cc5c7df1aecd42 +size 800116456 diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-500/optimizer.pt b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-500/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..e6c05c948df05e38620c64681c3c238bd7291557 --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-500/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:145159af8e1a2bed3a3aecd50e88ac82a30473c19708aad07c9abd1c09cd9a78 +size 406743860 diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-500/rng_state.pth b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-500/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..aedc0590070e9ad692818dbc4191f8bf47f26b47 --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-500/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cf5ae8e08760a6bcc35a93a9322acff8f0c0f572968771d95f5b4ef2395c0900 +size 14244 diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-500/scheduler.pt b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-500/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..1316f19a3a539ed1fa426a060074257583adaf23 --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-500/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1bf5aa8c315cab01b65b6d26428d469f88fcea8e6aa6ad642d376e279b3eb1c1 +size 1064 diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-500/special_tokens_map.json b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-500/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..f6119589e367b2de0fc8cbd2f1217667532e3174 --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-500/special_tokens_map.json @@ -0,0 +1,30 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-500/tokenizer.json b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-500/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..45a5e23f54141c5f4f97a8d58f3ffadc28e287ba --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-500/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d964a2c8346d40f95791533eae48730d5f163c2e65fd16333560fd3e661df318 +size 34362915 diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-500/tokenizer.model b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-500/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..71a98ce40269d847e58957e1e070d9ae8eb184af --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-500/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:583f2ebd2a1936009b7da991ea255504db68c7a9713a78673d1335a87098966c +size 4241023 diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-500/tokenizer_config.json b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-500/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9b9b1b4acdd4afcedae39d1cf6f0bc7ef7d9910f --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-500/tokenizer_config.json @@ -0,0 +1,2011 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "<|fim_prefix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "<|fim_middle|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "<|fim_suffix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "<|file_separator|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "139": { + "content": "▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "140": { + "content": "▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "141": { + "content": "▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "142": { + "content": "▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "143": { + "content": "▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "144": { + "content": "▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "145": { + "content": "▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "146": { + "content": "▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "147": { + "content": "▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "148": { + "content": "▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "149": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "150": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "152": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "153": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "154": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "155": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "156": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "157": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "158": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "159": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "160": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "161": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "162": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "163": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "164": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "165": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "166": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "167": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "168": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255968": { + "content": "[toxicity=0]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255969": { + "content": "\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255970": { + "content": "\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255971": { + "content": "\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255972": { + "content": "\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255973": { + "content": "\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255974": { + "content": "\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255975": { + "content": "\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255976": { + "content": "\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255977": { + "content": "\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255978": { + "content": "\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255979": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255980": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255981": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255982": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255983": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255984": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255985": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255986": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255987": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255988": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255989": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255990": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255991": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255992": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255993": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255994": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255995": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255996": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255997": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255998": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255999": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "bos_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "extra_special_tokens": {}, + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "padding_side": "left", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-500/trainer_state.json b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-500/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..26f8849c7751adf7264210fc706b3e9931b9a8d7 --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-500/trainer_state.json @@ -0,0 +1,3574 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0308721378955492, + "eval_steps": 100, + "global_step": 500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0020581425263699513, + "grad_norm": 11.994463920593262, + "learning_rate": 2.061855670103093e-07, + "loss": 2.91, + "step": 1 + }, + { + "epoch": 0.004116285052739903, + "grad_norm": 11.769092559814453, + "learning_rate": 4.123711340206186e-07, + "loss": 2.8686, + "step": 2 + }, + { + "epoch": 0.0061744275791098535, + "grad_norm": 13.05551815032959, + "learning_rate": 6.185567010309279e-07, + "loss": 3.0286, + "step": 3 + }, + { + "epoch": 0.008232570105479805, + "grad_norm": 12.334521293640137, + "learning_rate": 8.247422680412372e-07, + "loss": 2.904, + "step": 4 + }, + { + "epoch": 0.010290712631849755, + "grad_norm": 12.075353622436523, + "learning_rate": 1.0309278350515464e-06, + "loss": 2.8991, + "step": 5 + }, + { + "epoch": 0.012348855158219707, + "grad_norm": 11.86032485961914, + "learning_rate": 1.2371134020618557e-06, + "loss": 3.0007, + "step": 6 + }, + { + "epoch": 0.014406997684589657, + "grad_norm": 10.10457992553711, + "learning_rate": 1.4432989690721649e-06, + "loss": 2.8493, + "step": 7 + }, + { + "epoch": 0.01646514021095961, + "grad_norm": 8.56408405303955, + "learning_rate": 1.6494845360824744e-06, + "loss": 2.9573, + "step": 8 + }, + { + "epoch": 0.01852328273732956, + "grad_norm": 6.307392120361328, + "learning_rate": 1.8556701030927837e-06, + "loss": 2.9507, + "step": 9 + }, + { + "epoch": 0.02058142526369951, + "grad_norm": 4.276430130004883, + "learning_rate": 2.061855670103093e-06, + "loss": 2.8988, + "step": 10 + }, + { + "epoch": 0.022639567790069464, + "grad_norm": 2.5912015438079834, + "learning_rate": 2.268041237113402e-06, + "loss": 2.9926, + "step": 11 + }, + { + "epoch": 0.024697710316439414, + "grad_norm": 2.018446207046509, + "learning_rate": 2.4742268041237115e-06, + "loss": 2.9874, + "step": 12 + }, + { + "epoch": 0.026755852842809364, + "grad_norm": 1.8558588027954102, + "learning_rate": 2.680412371134021e-06, + "loss": 2.8608, + "step": 13 + }, + { + "epoch": 0.028813995369179314, + "grad_norm": 1.9658265113830566, + "learning_rate": 2.8865979381443297e-06, + "loss": 2.8596, + "step": 14 + }, + { + "epoch": 0.030872137895549268, + "grad_norm": 1.872044563293457, + "learning_rate": 3.0927835051546395e-06, + "loss": 2.8836, + "step": 15 + }, + { + "epoch": 0.03293028042191922, + "grad_norm": 1.8884096145629883, + "learning_rate": 3.298969072164949e-06, + "loss": 2.9383, + "step": 16 + }, + { + "epoch": 0.03498842294828917, + "grad_norm": 1.8795744180679321, + "learning_rate": 3.5051546391752577e-06, + "loss": 2.883, + "step": 17 + }, + { + "epoch": 0.03704656547465912, + "grad_norm": 1.783678412437439, + "learning_rate": 3.7113402061855674e-06, + "loss": 2.8019, + "step": 18 + }, + { + "epoch": 0.039104708001029075, + "grad_norm": 1.820617914199829, + "learning_rate": 3.917525773195877e-06, + "loss": 2.8813, + "step": 19 + }, + { + "epoch": 0.04116285052739902, + "grad_norm": 1.8188731670379639, + "learning_rate": 4.123711340206186e-06, + "loss": 2.8401, + "step": 20 + }, + { + "epoch": 0.043220993053768975, + "grad_norm": 1.7305251359939575, + "learning_rate": 4.329896907216495e-06, + "loss": 2.7478, + "step": 21 + }, + { + "epoch": 0.04527913558013893, + "grad_norm": 1.7014551162719727, + "learning_rate": 4.536082474226804e-06, + "loss": 2.7356, + "step": 22 + }, + { + "epoch": 0.047337278106508875, + "grad_norm": 1.677381157875061, + "learning_rate": 4.742268041237113e-06, + "loss": 2.7593, + "step": 23 + }, + { + "epoch": 0.04939542063287883, + "grad_norm": 1.628554344177246, + "learning_rate": 4.948453608247423e-06, + "loss": 2.7689, + "step": 24 + }, + { + "epoch": 0.051453563159248775, + "grad_norm": 1.4968128204345703, + "learning_rate": 5.154639175257732e-06, + "loss": 2.6613, + "step": 25 + }, + { + "epoch": 0.05351170568561873, + "grad_norm": 1.4734832048416138, + "learning_rate": 5.360824742268042e-06, + "loss": 2.7095, + "step": 26 + }, + { + "epoch": 0.05556984821198868, + "grad_norm": 1.3745571374893188, + "learning_rate": 5.567010309278351e-06, + "loss": 2.655, + "step": 27 + }, + { + "epoch": 0.05762799073835863, + "grad_norm": 1.3381729125976562, + "learning_rate": 5.7731958762886594e-06, + "loss": 2.55, + "step": 28 + }, + { + "epoch": 0.05968613326472858, + "grad_norm": 1.3388073444366455, + "learning_rate": 5.979381443298969e-06, + "loss": 2.5219, + "step": 29 + }, + { + "epoch": 0.061744275791098535, + "grad_norm": 1.317008376121521, + "learning_rate": 6.185567010309279e-06, + "loss": 2.4491, + "step": 30 + }, + { + "epoch": 0.06380241831746848, + "grad_norm": 1.3210794925689697, + "learning_rate": 6.391752577319588e-06, + "loss": 2.4358, + "step": 31 + }, + { + "epoch": 0.06586056084383844, + "grad_norm": 1.182519555091858, + "learning_rate": 6.597938144329898e-06, + "loss": 2.4514, + "step": 32 + }, + { + "epoch": 0.06791870337020839, + "grad_norm": 1.2238099575042725, + "learning_rate": 6.804123711340207e-06, + "loss": 2.442, + "step": 33 + }, + { + "epoch": 0.06997684589657834, + "grad_norm": 1.1793314218521118, + "learning_rate": 7.010309278350515e-06, + "loss": 2.3864, + "step": 34 + }, + { + "epoch": 0.0720349884229483, + "grad_norm": 1.1983020305633545, + "learning_rate": 7.216494845360825e-06, + "loss": 2.3796, + "step": 35 + }, + { + "epoch": 0.07409313094931824, + "grad_norm": 1.2189652919769287, + "learning_rate": 7.422680412371135e-06, + "loss": 2.4152, + "step": 36 + }, + { + "epoch": 0.07615127347568819, + "grad_norm": 1.14923095703125, + "learning_rate": 7.628865979381444e-06, + "loss": 2.3298, + "step": 37 + }, + { + "epoch": 0.07820941600205815, + "grad_norm": 1.147013545036316, + "learning_rate": 7.835051546391754e-06, + "loss": 2.2488, + "step": 38 + }, + { + "epoch": 0.0802675585284281, + "grad_norm": 1.133981466293335, + "learning_rate": 8.041237113402063e-06, + "loss": 2.1825, + "step": 39 + }, + { + "epoch": 0.08232570105479804, + "grad_norm": 1.1686867475509644, + "learning_rate": 8.247422680412371e-06, + "loss": 2.2282, + "step": 40 + }, + { + "epoch": 0.084383843581168, + "grad_norm": 1.131690502166748, + "learning_rate": 8.453608247422681e-06, + "loss": 2.0962, + "step": 41 + }, + { + "epoch": 0.08644198610753795, + "grad_norm": 1.1626195907592773, + "learning_rate": 8.65979381443299e-06, + "loss": 2.1161, + "step": 42 + }, + { + "epoch": 0.0885001286339079, + "grad_norm": 1.1508581638336182, + "learning_rate": 8.865979381443299e-06, + "loss": 1.9856, + "step": 43 + }, + { + "epoch": 0.09055827116027786, + "grad_norm": 1.2286733388900757, + "learning_rate": 9.072164948453609e-06, + "loss": 2.076, + "step": 44 + }, + { + "epoch": 0.0926164136866478, + "grad_norm": 1.82068932056427, + "learning_rate": 9.278350515463918e-06, + "loss": 1.9995, + "step": 45 + }, + { + "epoch": 0.09467455621301775, + "grad_norm": 2.079101324081421, + "learning_rate": 9.484536082474226e-06, + "loss": 1.9601, + "step": 46 + }, + { + "epoch": 0.0967326987393877, + "grad_norm": 1.1209226846694946, + "learning_rate": 9.690721649484536e-06, + "loss": 1.9346, + "step": 47 + }, + { + "epoch": 0.09879084126575766, + "grad_norm": 1.0579711198806763, + "learning_rate": 9.896907216494846e-06, + "loss": 1.8764, + "step": 48 + }, + { + "epoch": 0.1008489837921276, + "grad_norm": 1.0434011220932007, + "learning_rate": 1.0103092783505156e-05, + "loss": 1.8483, + "step": 49 + }, + { + "epoch": 0.10290712631849755, + "grad_norm": 1.0089991092681885, + "learning_rate": 1.0309278350515464e-05, + "loss": 1.8018, + "step": 50 + }, + { + "epoch": 0.10496526884486751, + "grad_norm": 1.0117324590682983, + "learning_rate": 1.0515463917525775e-05, + "loss": 1.8003, + "step": 51 + }, + { + "epoch": 0.10702341137123746, + "grad_norm": 1.0006697177886963, + "learning_rate": 1.0721649484536083e-05, + "loss": 1.7482, + "step": 52 + }, + { + "epoch": 0.1090815538976074, + "grad_norm": 2.1164329051971436, + "learning_rate": 1.0927835051546391e-05, + "loss": 1.7363, + "step": 53 + }, + { + "epoch": 0.11113969642397736, + "grad_norm": 0.9573502540588379, + "learning_rate": 1.1134020618556703e-05, + "loss": 1.661, + "step": 54 + }, + { + "epoch": 0.11319783895034731, + "grad_norm": 1.0059764385223389, + "learning_rate": 1.134020618556701e-05, + "loss": 1.6979, + "step": 55 + }, + { + "epoch": 0.11525598147671726, + "grad_norm": 0.9719656109809875, + "learning_rate": 1.1546391752577319e-05, + "loss": 1.6318, + "step": 56 + }, + { + "epoch": 0.11731412400308722, + "grad_norm": 1.0024539232254028, + "learning_rate": 1.175257731958763e-05, + "loss": 1.6283, + "step": 57 + }, + { + "epoch": 0.11937226652945716, + "grad_norm": 0.9772456288337708, + "learning_rate": 1.1958762886597938e-05, + "loss": 1.5611, + "step": 58 + }, + { + "epoch": 0.12143040905582711, + "grad_norm": 0.9947625994682312, + "learning_rate": 1.2164948453608248e-05, + "loss": 1.6073, + "step": 59 + }, + { + "epoch": 0.12348855158219707, + "grad_norm": 2.112889051437378, + "learning_rate": 1.2371134020618558e-05, + "loss": 1.6208, + "step": 60 + }, + { + "epoch": 0.12554669410856703, + "grad_norm": 1.0515345335006714, + "learning_rate": 1.2577319587628866e-05, + "loss": 1.569, + "step": 61 + }, + { + "epoch": 0.12760483663493696, + "grad_norm": 1.0782145261764526, + "learning_rate": 1.2783505154639176e-05, + "loss": 1.5097, + "step": 62 + }, + { + "epoch": 0.12966297916130692, + "grad_norm": 1.154104232788086, + "learning_rate": 1.2989690721649485e-05, + "loss": 1.5472, + "step": 63 + }, + { + "epoch": 0.13172112168767688, + "grad_norm": 1.1614656448364258, + "learning_rate": 1.3195876288659795e-05, + "loss": 1.4833, + "step": 64 + }, + { + "epoch": 0.13377926421404682, + "grad_norm": 1.1720911264419556, + "learning_rate": 1.3402061855670103e-05, + "loss": 1.4644, + "step": 65 + }, + { + "epoch": 0.13583740674041678, + "grad_norm": 1.8903896808624268, + "learning_rate": 1.3608247422680415e-05, + "loss": 1.4286, + "step": 66 + }, + { + "epoch": 0.13789554926678674, + "grad_norm": 1.2675013542175293, + "learning_rate": 1.3814432989690723e-05, + "loss": 1.416, + "step": 67 + }, + { + "epoch": 0.13995369179315667, + "grad_norm": 1.266434907913208, + "learning_rate": 1.402061855670103e-05, + "loss": 1.3171, + "step": 68 + }, + { + "epoch": 0.14201183431952663, + "grad_norm": 1.3408889770507812, + "learning_rate": 1.4226804123711342e-05, + "loss": 1.3396, + "step": 69 + }, + { + "epoch": 0.1440699768458966, + "grad_norm": 1.3862446546554565, + "learning_rate": 1.443298969072165e-05, + "loss": 1.2642, + "step": 70 + }, + { + "epoch": 0.14612811937226652, + "grad_norm": 2.110553026199341, + "learning_rate": 1.4639175257731958e-05, + "loss": 1.2593, + "step": 71 + }, + { + "epoch": 0.14818626189863648, + "grad_norm": 1.7017499208450317, + "learning_rate": 1.484536082474227e-05, + "loss": 1.24, + "step": 72 + }, + { + "epoch": 0.15024440442500644, + "grad_norm": 1.9851700067520142, + "learning_rate": 1.5051546391752578e-05, + "loss": 1.2313, + "step": 73 + }, + { + "epoch": 0.15230254695137638, + "grad_norm": 2.009608030319214, + "learning_rate": 1.5257731958762888e-05, + "loss": 1.1281, + "step": 74 + }, + { + "epoch": 0.15436068947774634, + "grad_norm": 2.7587485313415527, + "learning_rate": 1.5463917525773197e-05, + "loss": 1.1248, + "step": 75 + }, + { + "epoch": 0.1564188320041163, + "grad_norm": 2.780954599380493, + "learning_rate": 1.5670103092783507e-05, + "loss": 1.0797, + "step": 76 + }, + { + "epoch": 0.15847697453048623, + "grad_norm": 3.1470866203308105, + "learning_rate": 1.5876288659793813e-05, + "loss": 1.0064, + "step": 77 + }, + { + "epoch": 0.1605351170568562, + "grad_norm": 4.653595447540283, + "learning_rate": 1.6082474226804127e-05, + "loss": 0.9219, + "step": 78 + }, + { + "epoch": 0.16259325958322615, + "grad_norm": 4.157363414764404, + "learning_rate": 1.6288659793814433e-05, + "loss": 0.8709, + "step": 79 + }, + { + "epoch": 0.16465140210959608, + "grad_norm": 4.5814924240112305, + "learning_rate": 1.6494845360824743e-05, + "loss": 0.7693, + "step": 80 + }, + { + "epoch": 0.16670954463596604, + "grad_norm": 5.096139907836914, + "learning_rate": 1.6701030927835052e-05, + "loss": 0.6868, + "step": 81 + }, + { + "epoch": 0.168767687162336, + "grad_norm": 4.858880519866943, + "learning_rate": 1.6907216494845362e-05, + "loss": 0.5971, + "step": 82 + }, + { + "epoch": 0.17082582968870594, + "grad_norm": 4.42564582824707, + "learning_rate": 1.7113402061855672e-05, + "loss": 0.4719, + "step": 83 + }, + { + "epoch": 0.1728839722150759, + "grad_norm": 7.720851421356201, + "learning_rate": 1.731958762886598e-05, + "loss": 0.3943, + "step": 84 + }, + { + "epoch": 0.17494211474144586, + "grad_norm": 0.41923192143440247, + "learning_rate": 1.752577319587629e-05, + "loss": 0.3635, + "step": 85 + }, + { + "epoch": 0.1770002572678158, + "grad_norm": 0.2771846354007721, + "learning_rate": 1.7731958762886598e-05, + "loss": 0.3597, + "step": 86 + }, + { + "epoch": 0.17905839979418575, + "grad_norm": 0.24761857092380524, + "learning_rate": 1.793814432989691e-05, + "loss": 0.3735, + "step": 87 + }, + { + "epoch": 0.1811165423205557, + "grad_norm": 0.23277048766613007, + "learning_rate": 1.8144329896907217e-05, + "loss": 0.3643, + "step": 88 + }, + { + "epoch": 0.18317468484692565, + "grad_norm": 0.22931228578090668, + "learning_rate": 1.8350515463917527e-05, + "loss": 0.3519, + "step": 89 + }, + { + "epoch": 0.1852328273732956, + "grad_norm": 0.20750615000724792, + "learning_rate": 1.8556701030927837e-05, + "loss": 0.3431, + "step": 90 + }, + { + "epoch": 0.18729096989966554, + "grad_norm": 0.2080322951078415, + "learning_rate": 1.8762886597938147e-05, + "loss": 0.3632, + "step": 91 + }, + { + "epoch": 0.1893491124260355, + "grad_norm": 0.20186181366443634, + "learning_rate": 1.8969072164948453e-05, + "loss": 0.3492, + "step": 92 + }, + { + "epoch": 0.19140725495240546, + "grad_norm": 0.19172786176204681, + "learning_rate": 1.9175257731958766e-05, + "loss": 0.3552, + "step": 93 + }, + { + "epoch": 0.1934653974787754, + "grad_norm": 0.1747850626707077, + "learning_rate": 1.9381443298969072e-05, + "loss": 0.3355, + "step": 94 + }, + { + "epoch": 0.19552354000514535, + "grad_norm": 0.196411594748497, + "learning_rate": 1.9587628865979382e-05, + "loss": 0.3271, + "step": 95 + }, + { + "epoch": 0.1975816825315153, + "grad_norm": 0.20063228905200958, + "learning_rate": 1.9793814432989692e-05, + "loss": 0.3351, + "step": 96 + }, + { + "epoch": 0.19963982505788525, + "grad_norm": 0.19240939617156982, + "learning_rate": 2e-05, + "loss": 0.3266, + "step": 97 + }, + { + "epoch": 0.2016979675842552, + "grad_norm": 0.18206572532653809, + "learning_rate": 1.997709049255441e-05, + "loss": 0.3393, + "step": 98 + }, + { + "epoch": 0.20375611011062517, + "grad_norm": 0.20384562015533447, + "learning_rate": 1.9954180985108823e-05, + "loss": 0.3395, + "step": 99 + }, + { + "epoch": 0.2058142526369951, + "grad_norm": 0.19944581389427185, + "learning_rate": 1.9931271477663232e-05, + "loss": 0.3268, + "step": 100 + }, + { + "epoch": 0.2058142526369951, + "eval_loss": 0.3456890285015106, + "eval_runtime": 2114.0178, + "eval_samples_per_second": 3.677, + "eval_steps_per_second": 0.92, + "step": 100 + }, + { + "epoch": 0.20787239516336506, + "grad_norm": 0.17743557691574097, + "learning_rate": 1.990836197021764e-05, + "loss": 0.3439, + "step": 101 + }, + { + "epoch": 0.20993053768973502, + "grad_norm": 0.18746449053287506, + "learning_rate": 1.9885452462772053e-05, + "loss": 0.326, + "step": 102 + }, + { + "epoch": 0.21198868021610495, + "grad_norm": 0.18555815517902374, + "learning_rate": 1.9862542955326462e-05, + "loss": 0.3337, + "step": 103 + }, + { + "epoch": 0.2140468227424749, + "grad_norm": 0.16591575741767883, + "learning_rate": 1.9839633447880874e-05, + "loss": 0.3121, + "step": 104 + }, + { + "epoch": 0.21610496526884487, + "grad_norm": 0.1621987372636795, + "learning_rate": 1.9816723940435283e-05, + "loss": 0.3287, + "step": 105 + }, + { + "epoch": 0.2181631077952148, + "grad_norm": 0.1614532470703125, + "learning_rate": 1.9793814432989692e-05, + "loss": 0.3306, + "step": 106 + }, + { + "epoch": 0.22022125032158477, + "grad_norm": 0.17993387579917908, + "learning_rate": 1.9770904925544104e-05, + "loss": 0.3341, + "step": 107 + }, + { + "epoch": 0.22227939284795473, + "grad_norm": 0.1550011783838272, + "learning_rate": 1.9747995418098513e-05, + "loss": 0.3197, + "step": 108 + }, + { + "epoch": 0.22433753537432466, + "grad_norm": 0.18471524119377136, + "learning_rate": 1.9725085910652922e-05, + "loss": 0.3285, + "step": 109 + }, + { + "epoch": 0.22639567790069462, + "grad_norm": 0.15604373812675476, + "learning_rate": 1.9702176403207334e-05, + "loss": 0.3298, + "step": 110 + }, + { + "epoch": 0.22845382042706458, + "grad_norm": 0.1682298630475998, + "learning_rate": 1.9679266895761743e-05, + "loss": 0.3343, + "step": 111 + }, + { + "epoch": 0.2305119629534345, + "grad_norm": 0.14933635294437408, + "learning_rate": 1.9656357388316152e-05, + "loss": 0.3134, + "step": 112 + }, + { + "epoch": 0.23257010547980447, + "grad_norm": 0.14892347157001495, + "learning_rate": 1.963344788087056e-05, + "loss": 0.3154, + "step": 113 + }, + { + "epoch": 0.23462824800617443, + "grad_norm": 0.1577889323234558, + "learning_rate": 1.9610538373424973e-05, + "loss": 0.3122, + "step": 114 + }, + { + "epoch": 0.23668639053254437, + "grad_norm": 0.16482344269752502, + "learning_rate": 1.9587628865979382e-05, + "loss": 0.3193, + "step": 115 + }, + { + "epoch": 0.23874453305891433, + "grad_norm": 0.15328913927078247, + "learning_rate": 1.956471935853379e-05, + "loss": 0.3217, + "step": 116 + }, + { + "epoch": 0.2408026755852843, + "grad_norm": 0.16140656173229218, + "learning_rate": 1.9541809851088203e-05, + "loss": 0.318, + "step": 117 + }, + { + "epoch": 0.24286081811165422, + "grad_norm": 0.15448373556137085, + "learning_rate": 1.9518900343642612e-05, + "loss": 0.3205, + "step": 118 + }, + { + "epoch": 0.24491896063802418, + "grad_norm": 0.14716887474060059, + "learning_rate": 1.9495990836197025e-05, + "loss": 0.3164, + "step": 119 + }, + { + "epoch": 0.24697710316439414, + "grad_norm": 0.16582027077674866, + "learning_rate": 1.9473081328751433e-05, + "loss": 0.3191, + "step": 120 + }, + { + "epoch": 0.24903524569076407, + "grad_norm": 0.15213699638843536, + "learning_rate": 1.9450171821305842e-05, + "loss": 0.304, + "step": 121 + }, + { + "epoch": 0.25109338821713406, + "grad_norm": 0.1659238487482071, + "learning_rate": 1.9427262313860255e-05, + "loss": 0.3184, + "step": 122 + }, + { + "epoch": 0.253151530743504, + "grad_norm": 0.15596656501293182, + "learning_rate": 1.9404352806414663e-05, + "loss": 0.3092, + "step": 123 + }, + { + "epoch": 0.2552096732698739, + "grad_norm": 0.15868476033210754, + "learning_rate": 1.9381443298969072e-05, + "loss": 0.3163, + "step": 124 + }, + { + "epoch": 0.2572678157962439, + "grad_norm": 0.15386095643043518, + "learning_rate": 1.9358533791523485e-05, + "loss": 0.3049, + "step": 125 + }, + { + "epoch": 0.25932595832261385, + "grad_norm": 0.15179213881492615, + "learning_rate": 1.9335624284077894e-05, + "loss": 0.3131, + "step": 126 + }, + { + "epoch": 0.2613841008489838, + "grad_norm": 0.1595134735107422, + "learning_rate": 1.9312714776632306e-05, + "loss": 0.3069, + "step": 127 + }, + { + "epoch": 0.26344224337535377, + "grad_norm": 0.16989803314208984, + "learning_rate": 1.9289805269186715e-05, + "loss": 0.3052, + "step": 128 + }, + { + "epoch": 0.2655003859017237, + "grad_norm": 0.14803892374038696, + "learning_rate": 1.9266895761741124e-05, + "loss": 0.3065, + "step": 129 + }, + { + "epoch": 0.26755852842809363, + "grad_norm": 0.16676583886146545, + "learning_rate": 1.9243986254295536e-05, + "loss": 0.2962, + "step": 130 + }, + { + "epoch": 0.2696166709544636, + "grad_norm": 0.15694552659988403, + "learning_rate": 1.9221076746849945e-05, + "loss": 0.3096, + "step": 131 + }, + { + "epoch": 0.27167481348083355, + "grad_norm": 0.17696696519851685, + "learning_rate": 1.9198167239404354e-05, + "loss": 0.3145, + "step": 132 + }, + { + "epoch": 0.2737329560072035, + "grad_norm": 0.17204038798809052, + "learning_rate": 1.9175257731958766e-05, + "loss": 0.3248, + "step": 133 + }, + { + "epoch": 0.2757910985335735, + "grad_norm": 0.15630359947681427, + "learning_rate": 1.9152348224513175e-05, + "loss": 0.3117, + "step": 134 + }, + { + "epoch": 0.2778492410599434, + "grad_norm": 0.15757997334003448, + "learning_rate": 1.9129438717067584e-05, + "loss": 0.3145, + "step": 135 + }, + { + "epoch": 0.27990738358631334, + "grad_norm": 0.16273653507232666, + "learning_rate": 1.9106529209621996e-05, + "loss": 0.3159, + "step": 136 + }, + { + "epoch": 0.28196552611268333, + "grad_norm": 0.16213104128837585, + "learning_rate": 1.9083619702176405e-05, + "loss": 0.2949, + "step": 137 + }, + { + "epoch": 0.28402366863905326, + "grad_norm": 0.15377865731716156, + "learning_rate": 1.9060710194730814e-05, + "loss": 0.306, + "step": 138 + }, + { + "epoch": 0.2860818111654232, + "grad_norm": 0.1545962244272232, + "learning_rate": 1.9037800687285223e-05, + "loss": 0.2966, + "step": 139 + }, + { + "epoch": 0.2881399536917932, + "grad_norm": 0.15516617894172668, + "learning_rate": 1.9014891179839635e-05, + "loss": 0.3122, + "step": 140 + }, + { + "epoch": 0.2901980962181631, + "grad_norm": 0.14734458923339844, + "learning_rate": 1.8991981672394044e-05, + "loss": 0.3118, + "step": 141 + }, + { + "epoch": 0.29225623874453305, + "grad_norm": 0.1644304096698761, + "learning_rate": 1.8969072164948453e-05, + "loss": 0.3027, + "step": 142 + }, + { + "epoch": 0.29431438127090304, + "grad_norm": 0.14632569253444672, + "learning_rate": 1.8946162657502865e-05, + "loss": 0.3023, + "step": 143 + }, + { + "epoch": 0.29637252379727297, + "grad_norm": 0.1573137789964676, + "learning_rate": 1.8923253150057274e-05, + "loss": 0.3102, + "step": 144 + }, + { + "epoch": 0.2984306663236429, + "grad_norm": 0.16423144936561584, + "learning_rate": 1.8900343642611686e-05, + "loss": 0.3033, + "step": 145 + }, + { + "epoch": 0.3004888088500129, + "grad_norm": 0.15420907735824585, + "learning_rate": 1.8877434135166095e-05, + "loss": 0.3089, + "step": 146 + }, + { + "epoch": 0.3025469513763828, + "grad_norm": 0.1579178273677826, + "learning_rate": 1.8854524627720504e-05, + "loss": 0.3071, + "step": 147 + }, + { + "epoch": 0.30460509390275275, + "grad_norm": 0.15866397321224213, + "learning_rate": 1.8831615120274916e-05, + "loss": 0.3083, + "step": 148 + }, + { + "epoch": 0.30666323642912274, + "grad_norm": 0.16651487350463867, + "learning_rate": 1.8808705612829325e-05, + "loss": 0.3099, + "step": 149 + }, + { + "epoch": 0.3087213789554927, + "grad_norm": 0.16281908750534058, + "learning_rate": 1.8785796105383734e-05, + "loss": 0.3034, + "step": 150 + }, + { + "epoch": 0.3107795214818626, + "grad_norm": 0.17449837923049927, + "learning_rate": 1.8762886597938147e-05, + "loss": 0.3054, + "step": 151 + }, + { + "epoch": 0.3128376640082326, + "grad_norm": 0.15403546392917633, + "learning_rate": 1.8739977090492555e-05, + "loss": 0.297, + "step": 152 + }, + { + "epoch": 0.31489580653460253, + "grad_norm": 0.1472466140985489, + "learning_rate": 1.8717067583046968e-05, + "loss": 0.2973, + "step": 153 + }, + { + "epoch": 0.31695394906097246, + "grad_norm": 0.16027937829494476, + "learning_rate": 1.8694158075601377e-05, + "loss": 0.3054, + "step": 154 + }, + { + "epoch": 0.31901209158734245, + "grad_norm": 0.17086225748062134, + "learning_rate": 1.8671248568155786e-05, + "loss": 0.307, + "step": 155 + }, + { + "epoch": 0.3210702341137124, + "grad_norm": 0.15930697321891785, + "learning_rate": 1.8648339060710198e-05, + "loss": 0.293, + "step": 156 + }, + { + "epoch": 0.3231283766400823, + "grad_norm": 0.17086376249790192, + "learning_rate": 1.8625429553264607e-05, + "loss": 0.293, + "step": 157 + }, + { + "epoch": 0.3251865191664523, + "grad_norm": 0.15970875322818756, + "learning_rate": 1.8602520045819016e-05, + "loss": 0.3083, + "step": 158 + }, + { + "epoch": 0.32724466169282224, + "grad_norm": 0.16355909407138824, + "learning_rate": 1.8579610538373428e-05, + "loss": 0.3139, + "step": 159 + }, + { + "epoch": 0.32930280421919217, + "grad_norm": 0.15183711051940918, + "learning_rate": 1.8556701030927837e-05, + "loss": 0.2953, + "step": 160 + }, + { + "epoch": 0.33136094674556216, + "grad_norm": 0.15123715996742249, + "learning_rate": 1.853379152348225e-05, + "loss": 0.3025, + "step": 161 + }, + { + "epoch": 0.3334190892719321, + "grad_norm": 0.1576143503189087, + "learning_rate": 1.8510882016036658e-05, + "loss": 0.2904, + "step": 162 + }, + { + "epoch": 0.335477231798302, + "grad_norm": 0.1457504779100418, + "learning_rate": 1.8487972508591067e-05, + "loss": 0.2909, + "step": 163 + }, + { + "epoch": 0.337535374324672, + "grad_norm": 0.1557442992925644, + "learning_rate": 1.846506300114548e-05, + "loss": 0.3027, + "step": 164 + }, + { + "epoch": 0.33959351685104194, + "grad_norm": 0.15662318468093872, + "learning_rate": 1.8442153493699888e-05, + "loss": 0.311, + "step": 165 + }, + { + "epoch": 0.3416516593774119, + "grad_norm": 0.16177058219909668, + "learning_rate": 1.8419243986254297e-05, + "loss": 0.2944, + "step": 166 + }, + { + "epoch": 0.34370980190378186, + "grad_norm": 0.16406729817390442, + "learning_rate": 1.8396334478808706e-05, + "loss": 0.2927, + "step": 167 + }, + { + "epoch": 0.3457679444301518, + "grad_norm": 0.16642791032791138, + "learning_rate": 1.8373424971363115e-05, + "loss": 0.3063, + "step": 168 + }, + { + "epoch": 0.34782608695652173, + "grad_norm": 0.1650693714618683, + "learning_rate": 1.8350515463917527e-05, + "loss": 0.2957, + "step": 169 + }, + { + "epoch": 0.3498842294828917, + "grad_norm": 0.15349675714969635, + "learning_rate": 1.8327605956471936e-05, + "loss": 0.297, + "step": 170 + }, + { + "epoch": 0.35194237200926165, + "grad_norm": 0.17770209908485413, + "learning_rate": 1.8304696449026348e-05, + "loss": 0.3011, + "step": 171 + }, + { + "epoch": 0.3540005145356316, + "grad_norm": 0.1647631675004959, + "learning_rate": 1.8281786941580757e-05, + "loss": 0.2962, + "step": 172 + }, + { + "epoch": 0.35605865706200157, + "grad_norm": 0.1603834480047226, + "learning_rate": 1.8258877434135166e-05, + "loss": 0.2937, + "step": 173 + }, + { + "epoch": 0.3581167995883715, + "grad_norm": 0.16780880093574524, + "learning_rate": 1.8235967926689578e-05, + "loss": 0.2997, + "step": 174 + }, + { + "epoch": 0.36017494211474144, + "grad_norm": 0.15976767241954803, + "learning_rate": 1.8213058419243987e-05, + "loss": 0.3043, + "step": 175 + }, + { + "epoch": 0.3622330846411114, + "grad_norm": 0.16236485540866852, + "learning_rate": 1.8190148911798396e-05, + "loss": 0.3069, + "step": 176 + }, + { + "epoch": 0.36429122716748136, + "grad_norm": 0.16391968727111816, + "learning_rate": 1.816723940435281e-05, + "loss": 0.2923, + "step": 177 + }, + { + "epoch": 0.3663493696938513, + "grad_norm": 0.15806889533996582, + "learning_rate": 1.8144329896907217e-05, + "loss": 0.2872, + "step": 178 + }, + { + "epoch": 0.3684075122202212, + "grad_norm": 0.1627352088689804, + "learning_rate": 1.812142038946163e-05, + "loss": 0.3032, + "step": 179 + }, + { + "epoch": 0.3704656547465912, + "grad_norm": 0.15103371441364288, + "learning_rate": 1.809851088201604e-05, + "loss": 0.2847, + "step": 180 + }, + { + "epoch": 0.37252379727296114, + "grad_norm": 0.15178488194942474, + "learning_rate": 1.8075601374570447e-05, + "loss": 0.3017, + "step": 181 + }, + { + "epoch": 0.3745819397993311, + "grad_norm": 0.15493899583816528, + "learning_rate": 1.805269186712486e-05, + "loss": 0.2901, + "step": 182 + }, + { + "epoch": 0.37664008232570106, + "grad_norm": 0.15990686416625977, + "learning_rate": 1.802978235967927e-05, + "loss": 0.2861, + "step": 183 + }, + { + "epoch": 0.378698224852071, + "grad_norm": 0.15824148058891296, + "learning_rate": 1.8006872852233677e-05, + "loss": 0.2885, + "step": 184 + }, + { + "epoch": 0.38075636737844093, + "grad_norm": 0.15690775215625763, + "learning_rate": 1.798396334478809e-05, + "loss": 0.2814, + "step": 185 + }, + { + "epoch": 0.3828145099048109, + "grad_norm": 0.15833796560764313, + "learning_rate": 1.79610538373425e-05, + "loss": 0.2847, + "step": 186 + }, + { + "epoch": 0.38487265243118085, + "grad_norm": 0.16560044884681702, + "learning_rate": 1.793814432989691e-05, + "loss": 0.3061, + "step": 187 + }, + { + "epoch": 0.3869307949575508, + "grad_norm": 0.16240179538726807, + "learning_rate": 1.791523482245132e-05, + "loss": 0.2943, + "step": 188 + }, + { + "epoch": 0.38898893748392077, + "grad_norm": 0.15825721621513367, + "learning_rate": 1.789232531500573e-05, + "loss": 0.2934, + "step": 189 + }, + { + "epoch": 0.3910470800102907, + "grad_norm": 0.16665388643741608, + "learning_rate": 1.786941580756014e-05, + "loss": 0.291, + "step": 190 + }, + { + "epoch": 0.39310522253666064, + "grad_norm": 0.16581200063228607, + "learning_rate": 1.784650630011455e-05, + "loss": 0.2849, + "step": 191 + }, + { + "epoch": 0.3951633650630306, + "grad_norm": 0.1604345291852951, + "learning_rate": 1.782359679266896e-05, + "loss": 0.3, + "step": 192 + }, + { + "epoch": 0.39722150758940056, + "grad_norm": 0.16107915341854095, + "learning_rate": 1.7800687285223368e-05, + "loss": 0.2847, + "step": 193 + }, + { + "epoch": 0.3992796501157705, + "grad_norm": 0.1571730375289917, + "learning_rate": 1.7777777777777777e-05, + "loss": 0.2863, + "step": 194 + }, + { + "epoch": 0.4013377926421405, + "grad_norm": 0.1656399518251419, + "learning_rate": 1.775486827033219e-05, + "loss": 0.2878, + "step": 195 + }, + { + "epoch": 0.4033959351685104, + "grad_norm": 0.16738460958003998, + "learning_rate": 1.7731958762886598e-05, + "loss": 0.286, + "step": 196 + }, + { + "epoch": 0.40545407769488034, + "grad_norm": 0.16704292595386505, + "learning_rate": 1.770904925544101e-05, + "loss": 0.2919, + "step": 197 + }, + { + "epoch": 0.40751222022125033, + "grad_norm": 0.16215579211711884, + "learning_rate": 1.768613974799542e-05, + "loss": 0.2874, + "step": 198 + }, + { + "epoch": 0.40957036274762026, + "grad_norm": 0.15573479235172272, + "learning_rate": 1.7663230240549828e-05, + "loss": 0.2904, + "step": 199 + }, + { + "epoch": 0.4116285052739902, + "grad_norm": 0.1707623153924942, + "learning_rate": 1.764032073310424e-05, + "loss": 0.289, + "step": 200 + }, + { + "epoch": 0.4116285052739902, + "eval_loss": 0.3214050829410553, + "eval_runtime": 2449.7742, + "eval_samples_per_second": 3.173, + "eval_steps_per_second": 0.794, + "step": 200 + }, + { + "epoch": 0.4136866478003602, + "grad_norm": 0.1699172556400299, + "learning_rate": 1.761741122565865e-05, + "loss": 0.2852, + "step": 201 + }, + { + "epoch": 0.4157447903267301, + "grad_norm": 0.19150058925151825, + "learning_rate": 1.7594501718213058e-05, + "loss": 0.29, + "step": 202 + }, + { + "epoch": 0.41780293285310005, + "grad_norm": 0.15794627368450165, + "learning_rate": 1.757159221076747e-05, + "loss": 0.2746, + "step": 203 + }, + { + "epoch": 0.41986107537947004, + "grad_norm": 0.17305190861225128, + "learning_rate": 1.754868270332188e-05, + "loss": 0.3003, + "step": 204 + }, + { + "epoch": 0.42191921790583997, + "grad_norm": 0.16257523000240326, + "learning_rate": 1.752577319587629e-05, + "loss": 0.2789, + "step": 205 + }, + { + "epoch": 0.4239773604322099, + "grad_norm": 0.17273619771003723, + "learning_rate": 1.75028636884307e-05, + "loss": 0.2917, + "step": 206 + }, + { + "epoch": 0.4260355029585799, + "grad_norm": 0.17502790689468384, + "learning_rate": 1.747995418098511e-05, + "loss": 0.2992, + "step": 207 + }, + { + "epoch": 0.4280936454849498, + "grad_norm": 0.16464050114154816, + "learning_rate": 1.745704467353952e-05, + "loss": 0.2873, + "step": 208 + }, + { + "epoch": 0.43015178801131976, + "grad_norm": 0.1681668758392334, + "learning_rate": 1.743413516609393e-05, + "loss": 0.2991, + "step": 209 + }, + { + "epoch": 0.43220993053768975, + "grad_norm": 0.16957956552505493, + "learning_rate": 1.741122565864834e-05, + "loss": 0.2868, + "step": 210 + }, + { + "epoch": 0.4342680730640597, + "grad_norm": 0.15875883400440216, + "learning_rate": 1.738831615120275e-05, + "loss": 0.2946, + "step": 211 + }, + { + "epoch": 0.4363262155904296, + "grad_norm": 0.18127889931201935, + "learning_rate": 1.736540664375716e-05, + "loss": 0.2835, + "step": 212 + }, + { + "epoch": 0.4383843581167996, + "grad_norm": 0.17822811007499695, + "learning_rate": 1.7342497136311573e-05, + "loss": 0.2944, + "step": 213 + }, + { + "epoch": 0.44044250064316953, + "grad_norm": 0.17555806040763855, + "learning_rate": 1.731958762886598e-05, + "loss": 0.3001, + "step": 214 + }, + { + "epoch": 0.44250064316953946, + "grad_norm": 0.18709121644496918, + "learning_rate": 1.729667812142039e-05, + "loss": 0.282, + "step": 215 + }, + { + "epoch": 0.44455878569590945, + "grad_norm": 0.16322475671768188, + "learning_rate": 1.7273768613974803e-05, + "loss": 0.2883, + "step": 216 + }, + { + "epoch": 0.4466169282222794, + "grad_norm": 0.1677054911851883, + "learning_rate": 1.7250859106529212e-05, + "loss": 0.28, + "step": 217 + }, + { + "epoch": 0.4486750707486493, + "grad_norm": 0.15764063596725464, + "learning_rate": 1.722794959908362e-05, + "loss": 0.2768, + "step": 218 + }, + { + "epoch": 0.4507332132750193, + "grad_norm": 0.16166841983795166, + "learning_rate": 1.7205040091638033e-05, + "loss": 0.2868, + "step": 219 + }, + { + "epoch": 0.45279135580138924, + "grad_norm": 0.1799350380897522, + "learning_rate": 1.7182130584192442e-05, + "loss": 0.2891, + "step": 220 + }, + { + "epoch": 0.45484949832775917, + "grad_norm": 0.18119174242019653, + "learning_rate": 1.715922107674685e-05, + "loss": 0.2841, + "step": 221 + }, + { + "epoch": 0.45690764085412916, + "grad_norm": 0.17725548148155212, + "learning_rate": 1.713631156930126e-05, + "loss": 0.3038, + "step": 222 + }, + { + "epoch": 0.4589657833804991, + "grad_norm": 0.1628233790397644, + "learning_rate": 1.7113402061855672e-05, + "loss": 0.2868, + "step": 223 + }, + { + "epoch": 0.461023925906869, + "grad_norm": 0.1745166927576065, + "learning_rate": 1.709049255441008e-05, + "loss": 0.3033, + "step": 224 + }, + { + "epoch": 0.463082068433239, + "grad_norm": 0.17708267271518707, + "learning_rate": 1.706758304696449e-05, + "loss": 0.2842, + "step": 225 + }, + { + "epoch": 0.46514021095960895, + "grad_norm": 0.1738453358411789, + "learning_rate": 1.7044673539518902e-05, + "loss": 0.3005, + "step": 226 + }, + { + "epoch": 0.4671983534859789, + "grad_norm": 0.1706874966621399, + "learning_rate": 1.702176403207331e-05, + "loss": 0.2924, + "step": 227 + }, + { + "epoch": 0.46925649601234887, + "grad_norm": 0.1697423756122589, + "learning_rate": 1.699885452462772e-05, + "loss": 0.2783, + "step": 228 + }, + { + "epoch": 0.4713146385387188, + "grad_norm": 0.1783403754234314, + "learning_rate": 1.6975945017182132e-05, + "loss": 0.2924, + "step": 229 + }, + { + "epoch": 0.47337278106508873, + "grad_norm": 0.17431536316871643, + "learning_rate": 1.695303550973654e-05, + "loss": 0.2792, + "step": 230 + }, + { + "epoch": 0.4754309235914587, + "grad_norm": 0.164026141166687, + "learning_rate": 1.6930126002290953e-05, + "loss": 0.2825, + "step": 231 + }, + { + "epoch": 0.47748906611782865, + "grad_norm": 0.16449657082557678, + "learning_rate": 1.6907216494845362e-05, + "loss": 0.2831, + "step": 232 + }, + { + "epoch": 0.4795472086441986, + "grad_norm": 0.1812741607427597, + "learning_rate": 1.688430698739977e-05, + "loss": 0.2849, + "step": 233 + }, + { + "epoch": 0.4816053511705686, + "grad_norm": 0.18431834876537323, + "learning_rate": 1.6861397479954183e-05, + "loss": 0.2802, + "step": 234 + }, + { + "epoch": 0.4836634936969385, + "grad_norm": 0.18349015712738037, + "learning_rate": 1.6838487972508592e-05, + "loss": 0.2804, + "step": 235 + }, + { + "epoch": 0.48572163622330844, + "grad_norm": 0.1769968420267105, + "learning_rate": 1.6815578465063e-05, + "loss": 0.2777, + "step": 236 + }, + { + "epoch": 0.4877797787496784, + "grad_norm": 0.17207500338554382, + "learning_rate": 1.6792668957617413e-05, + "loss": 0.2883, + "step": 237 + }, + { + "epoch": 0.48983792127604836, + "grad_norm": 0.1729692667722702, + "learning_rate": 1.6769759450171822e-05, + "loss": 0.2784, + "step": 238 + }, + { + "epoch": 0.4918960638024183, + "grad_norm": 0.17234881222248077, + "learning_rate": 1.6746849942726235e-05, + "loss": 0.2816, + "step": 239 + }, + { + "epoch": 0.4939542063287883, + "grad_norm": 0.17132551968097687, + "learning_rate": 1.6723940435280644e-05, + "loss": 0.2812, + "step": 240 + }, + { + "epoch": 0.4960123488551582, + "grad_norm": 0.1752254068851471, + "learning_rate": 1.6701030927835052e-05, + "loss": 0.2799, + "step": 241 + }, + { + "epoch": 0.49807049138152815, + "grad_norm": 0.1768665313720703, + "learning_rate": 1.6678121420389465e-05, + "loss": 0.2966, + "step": 242 + }, + { + "epoch": 0.5001286339078981, + "grad_norm": 0.18139514327049255, + "learning_rate": 1.6655211912943874e-05, + "loss": 0.2816, + "step": 243 + }, + { + "epoch": 0.5021867764342681, + "grad_norm": 0.17312943935394287, + "learning_rate": 1.6632302405498283e-05, + "loss": 0.2845, + "step": 244 + }, + { + "epoch": 0.5042449189606381, + "grad_norm": 0.17966389656066895, + "learning_rate": 1.6609392898052695e-05, + "loss": 0.2864, + "step": 245 + }, + { + "epoch": 0.506303061487008, + "grad_norm": 0.16653811931610107, + "learning_rate": 1.6586483390607104e-05, + "loss": 0.2759, + "step": 246 + }, + { + "epoch": 0.5083612040133779, + "grad_norm": 0.1634613424539566, + "learning_rate": 1.6563573883161516e-05, + "loss": 0.2728, + "step": 247 + }, + { + "epoch": 0.5104193465397479, + "grad_norm": 0.17358507215976715, + "learning_rate": 1.654066437571592e-05, + "loss": 0.2706, + "step": 248 + }, + { + "epoch": 0.5124774890661178, + "grad_norm": 0.17524316906929016, + "learning_rate": 1.6517754868270334e-05, + "loss": 0.2805, + "step": 249 + }, + { + "epoch": 0.5145356315924878, + "grad_norm": 0.18134094774723053, + "learning_rate": 1.6494845360824743e-05, + "loss": 0.2909, + "step": 250 + }, + { + "epoch": 0.5165937741188578, + "grad_norm": 0.17795510590076447, + "learning_rate": 1.647193585337915e-05, + "loss": 0.2889, + "step": 251 + }, + { + "epoch": 0.5186519166452277, + "grad_norm": 0.16782547533512115, + "learning_rate": 1.6449026345933564e-05, + "loss": 0.2842, + "step": 252 + }, + { + "epoch": 0.5207100591715976, + "grad_norm": 0.17360062897205353, + "learning_rate": 1.6426116838487973e-05, + "loss": 0.2763, + "step": 253 + }, + { + "epoch": 0.5227682016979676, + "grad_norm": 0.17241406440734863, + "learning_rate": 1.6403207331042385e-05, + "loss": 0.2753, + "step": 254 + }, + { + "epoch": 0.5248263442243375, + "grad_norm": 0.1709229201078415, + "learning_rate": 1.6380297823596794e-05, + "loss": 0.2732, + "step": 255 + }, + { + "epoch": 0.5268844867507075, + "grad_norm": 0.1807374209165573, + "learning_rate": 1.6357388316151203e-05, + "loss": 0.2856, + "step": 256 + }, + { + "epoch": 0.5289426292770775, + "grad_norm": 0.1749904304742813, + "learning_rate": 1.6334478808705615e-05, + "loss": 0.285, + "step": 257 + }, + { + "epoch": 0.5310007718034474, + "grad_norm": 0.16673170030117035, + "learning_rate": 1.6311569301260024e-05, + "loss": 0.2825, + "step": 258 + }, + { + "epoch": 0.5330589143298173, + "grad_norm": 0.17239685356616974, + "learning_rate": 1.6288659793814433e-05, + "loss": 0.2845, + "step": 259 + }, + { + "epoch": 0.5351170568561873, + "grad_norm": 0.1831504851579666, + "learning_rate": 1.6265750286368845e-05, + "loss": 0.2859, + "step": 260 + }, + { + "epoch": 0.5371751993825572, + "grad_norm": 0.18507827818393707, + "learning_rate": 1.6242840778923254e-05, + "loss": 0.293, + "step": 261 + }, + { + "epoch": 0.5392333419089272, + "grad_norm": 0.16738134622573853, + "learning_rate": 1.6219931271477663e-05, + "loss": 0.2853, + "step": 262 + }, + { + "epoch": 0.5412914844352972, + "grad_norm": 0.1701226830482483, + "learning_rate": 1.6197021764032075e-05, + "loss": 0.2763, + "step": 263 + }, + { + "epoch": 0.5433496269616671, + "grad_norm": 0.18195705115795135, + "learning_rate": 1.6174112256586484e-05, + "loss": 0.2797, + "step": 264 + }, + { + "epoch": 0.545407769488037, + "grad_norm": 0.1832309514284134, + "learning_rate": 1.6151202749140896e-05, + "loss": 0.2885, + "step": 265 + }, + { + "epoch": 0.547465912014407, + "grad_norm": 0.1773810088634491, + "learning_rate": 1.6128293241695305e-05, + "loss": 0.2682, + "step": 266 + }, + { + "epoch": 0.5495240545407769, + "grad_norm": 0.16989603638648987, + "learning_rate": 1.6105383734249714e-05, + "loss": 0.2821, + "step": 267 + }, + { + "epoch": 0.551582197067147, + "grad_norm": 0.17835170030593872, + "learning_rate": 1.6082474226804127e-05, + "loss": 0.2774, + "step": 268 + }, + { + "epoch": 0.5536403395935169, + "grad_norm": 0.1777082234621048, + "learning_rate": 1.6059564719358535e-05, + "loss": 0.2726, + "step": 269 + }, + { + "epoch": 0.5556984821198868, + "grad_norm": 0.18766450881958008, + "learning_rate": 1.6036655211912944e-05, + "loss": 0.2879, + "step": 270 + }, + { + "epoch": 0.5577566246462567, + "grad_norm": 0.1868186593055725, + "learning_rate": 1.6013745704467357e-05, + "loss": 0.2808, + "step": 271 + }, + { + "epoch": 0.5598147671726267, + "grad_norm": 0.16695882380008698, + "learning_rate": 1.5990836197021766e-05, + "loss": 0.2668, + "step": 272 + }, + { + "epoch": 0.5618729096989966, + "grad_norm": 0.17224495112895966, + "learning_rate": 1.5967926689576178e-05, + "loss": 0.2682, + "step": 273 + }, + { + "epoch": 0.5639310522253667, + "grad_norm": 0.20116423070430756, + "learning_rate": 1.5945017182130587e-05, + "loss": 0.276, + "step": 274 + }, + { + "epoch": 0.5659891947517366, + "grad_norm": 0.19478343427181244, + "learning_rate": 1.5922107674684996e-05, + "loss": 0.2854, + "step": 275 + }, + { + "epoch": 0.5680473372781065, + "grad_norm": 0.20242950320243835, + "learning_rate": 1.5899198167239405e-05, + "loss": 0.2854, + "step": 276 + }, + { + "epoch": 0.5701054798044765, + "grad_norm": 0.19146093726158142, + "learning_rate": 1.5876288659793813e-05, + "loss": 0.2817, + "step": 277 + }, + { + "epoch": 0.5721636223308464, + "grad_norm": 0.1804896742105484, + "learning_rate": 1.5853379152348226e-05, + "loss": 0.2714, + "step": 278 + }, + { + "epoch": 0.5742217648572163, + "grad_norm": 0.19315646588802338, + "learning_rate": 1.5830469644902635e-05, + "loss": 0.2703, + "step": 279 + }, + { + "epoch": 0.5762799073835864, + "grad_norm": 0.1910266876220703, + "learning_rate": 1.5807560137457047e-05, + "loss": 0.2728, + "step": 280 + }, + { + "epoch": 0.5783380499099563, + "grad_norm": 0.20330773293972015, + "learning_rate": 1.5784650630011456e-05, + "loss": 0.2717, + "step": 281 + }, + { + "epoch": 0.5803961924363262, + "grad_norm": 0.19080683588981628, + "learning_rate": 1.5761741122565865e-05, + "loss": 0.2679, + "step": 282 + }, + { + "epoch": 0.5824543349626962, + "grad_norm": 0.18052135407924652, + "learning_rate": 1.5738831615120277e-05, + "loss": 0.2815, + "step": 283 + }, + { + "epoch": 0.5845124774890661, + "grad_norm": 0.1998361051082611, + "learning_rate": 1.5715922107674686e-05, + "loss": 0.2888, + "step": 284 + }, + { + "epoch": 0.586570620015436, + "grad_norm": 0.1978764683008194, + "learning_rate": 1.5693012600229095e-05, + "loss": 0.2926, + "step": 285 + }, + { + "epoch": 0.5886287625418061, + "grad_norm": 0.17189203202724457, + "learning_rate": 1.5670103092783507e-05, + "loss": 0.2674, + "step": 286 + }, + { + "epoch": 0.590686905068176, + "grad_norm": 0.1937166303396225, + "learning_rate": 1.5647193585337916e-05, + "loss": 0.2838, + "step": 287 + }, + { + "epoch": 0.5927450475945459, + "grad_norm": 0.18978627026081085, + "learning_rate": 1.5624284077892328e-05, + "loss": 0.273, + "step": 288 + }, + { + "epoch": 0.5948031901209159, + "grad_norm": 0.17718705534934998, + "learning_rate": 1.5601374570446737e-05, + "loss": 0.2842, + "step": 289 + }, + { + "epoch": 0.5968613326472858, + "grad_norm": 0.1912536770105362, + "learning_rate": 1.5578465063001146e-05, + "loss": 0.2736, + "step": 290 + }, + { + "epoch": 0.5989194751736557, + "grad_norm": 0.18104907870292664, + "learning_rate": 1.555555555555556e-05, + "loss": 0.274, + "step": 291 + }, + { + "epoch": 0.6009776177000258, + "grad_norm": 0.1620381772518158, + "learning_rate": 1.5532646048109967e-05, + "loss": 0.2663, + "step": 292 + }, + { + "epoch": 0.6030357602263957, + "grad_norm": 0.17973916232585907, + "learning_rate": 1.5509736540664376e-05, + "loss": 0.2791, + "step": 293 + }, + { + "epoch": 0.6050939027527656, + "grad_norm": 0.16821186244487762, + "learning_rate": 1.548682703321879e-05, + "loss": 0.2787, + "step": 294 + }, + { + "epoch": 0.6071520452791356, + "grad_norm": 0.18426693975925446, + "learning_rate": 1.5463917525773197e-05, + "loss": 0.2886, + "step": 295 + }, + { + "epoch": 0.6092101878055055, + "grad_norm": 0.19796033203601837, + "learning_rate": 1.5441008018327606e-05, + "loss": 0.268, + "step": 296 + }, + { + "epoch": 0.6112683303318754, + "grad_norm": 0.1971343755722046, + "learning_rate": 1.541809851088202e-05, + "loss": 0.2761, + "step": 297 + }, + { + "epoch": 0.6133264728582455, + "grad_norm": 0.17458567023277283, + "learning_rate": 1.5395189003436427e-05, + "loss": 0.2831, + "step": 298 + }, + { + "epoch": 0.6153846153846154, + "grad_norm": 0.17610400915145874, + "learning_rate": 1.537227949599084e-05, + "loss": 0.2691, + "step": 299 + }, + { + "epoch": 0.6174427579109854, + "grad_norm": 0.1929042488336563, + "learning_rate": 1.534936998854525e-05, + "loss": 0.2847, + "step": 300 + }, + { + "epoch": 0.6174427579109854, + "eval_loss": 0.2959522604942322, + "eval_runtime": 2428.6339, + "eval_samples_per_second": 3.201, + "eval_steps_per_second": 0.8, + "step": 300 + }, + { + "epoch": 0.6195009004373553, + "grad_norm": 0.19430233538150787, + "learning_rate": 1.5326460481099657e-05, + "loss": 0.279, + "step": 301 + }, + { + "epoch": 0.6215590429637252, + "grad_norm": 0.18542642891407013, + "learning_rate": 1.5303550973654066e-05, + "loss": 0.2695, + "step": 302 + }, + { + "epoch": 0.6236171854900951, + "grad_norm": 0.1850169450044632, + "learning_rate": 1.5280641466208475e-05, + "loss": 0.2847, + "step": 303 + }, + { + "epoch": 0.6256753280164652, + "grad_norm": 0.18449267745018005, + "learning_rate": 1.5257731958762888e-05, + "loss": 0.2804, + "step": 304 + }, + { + "epoch": 0.6277334705428351, + "grad_norm": 0.18608458340168, + "learning_rate": 1.5234822451317296e-05, + "loss": 0.2792, + "step": 305 + }, + { + "epoch": 0.6297916130692051, + "grad_norm": 0.21136076748371124, + "learning_rate": 1.5211912943871707e-05, + "loss": 0.2829, + "step": 306 + }, + { + "epoch": 0.631849755595575, + "grad_norm": 0.19672206044197083, + "learning_rate": 1.5189003436426118e-05, + "loss": 0.2854, + "step": 307 + }, + { + "epoch": 0.6339078981219449, + "grad_norm": 0.1834034025669098, + "learning_rate": 1.5166093928980528e-05, + "loss": 0.2775, + "step": 308 + }, + { + "epoch": 0.6359660406483149, + "grad_norm": 0.18414819240570068, + "learning_rate": 1.5143184421534937e-05, + "loss": 0.2794, + "step": 309 + }, + { + "epoch": 0.6380241831746849, + "grad_norm": 0.1890152245759964, + "learning_rate": 1.5120274914089348e-05, + "loss": 0.2718, + "step": 310 + }, + { + "epoch": 0.6400823257010548, + "grad_norm": 0.18923887610435486, + "learning_rate": 1.5097365406643758e-05, + "loss": 0.2795, + "step": 311 + }, + { + "epoch": 0.6421404682274248, + "grad_norm": 0.20047079026699066, + "learning_rate": 1.5074455899198169e-05, + "loss": 0.2811, + "step": 312 + }, + { + "epoch": 0.6441986107537947, + "grad_norm": 0.1910201609134674, + "learning_rate": 1.5051546391752578e-05, + "loss": 0.2732, + "step": 313 + }, + { + "epoch": 0.6462567532801646, + "grad_norm": 0.2021956443786621, + "learning_rate": 1.5028636884306988e-05, + "loss": 0.2806, + "step": 314 + }, + { + "epoch": 0.6483148958065346, + "grad_norm": 0.18957914412021637, + "learning_rate": 1.5005727376861399e-05, + "loss": 0.2681, + "step": 315 + }, + { + "epoch": 0.6503730383329046, + "grad_norm": 0.19858811795711517, + "learning_rate": 1.498281786941581e-05, + "loss": 0.2805, + "step": 316 + }, + { + "epoch": 0.6524311808592745, + "grad_norm": 0.1731935292482376, + "learning_rate": 1.4959908361970218e-05, + "loss": 0.2646, + "step": 317 + }, + { + "epoch": 0.6544893233856445, + "grad_norm": 0.19619058072566986, + "learning_rate": 1.4936998854524629e-05, + "loss": 0.2965, + "step": 318 + }, + { + "epoch": 0.6565474659120144, + "grad_norm": 0.18745696544647217, + "learning_rate": 1.491408934707904e-05, + "loss": 0.2766, + "step": 319 + }, + { + "epoch": 0.6586056084383843, + "grad_norm": 0.18006449937820435, + "learning_rate": 1.489117983963345e-05, + "loss": 0.2788, + "step": 320 + }, + { + "epoch": 0.6606637509647543, + "grad_norm": 0.17593689262866974, + "learning_rate": 1.486827033218786e-05, + "loss": 0.2813, + "step": 321 + }, + { + "epoch": 0.6627218934911243, + "grad_norm": 0.18695640563964844, + "learning_rate": 1.484536082474227e-05, + "loss": 0.281, + "step": 322 + }, + { + "epoch": 0.6647800360174942, + "grad_norm": 0.17909488081932068, + "learning_rate": 1.482245131729668e-05, + "loss": 0.2814, + "step": 323 + }, + { + "epoch": 0.6668381785438642, + "grad_norm": 0.19074076414108276, + "learning_rate": 1.4799541809851091e-05, + "loss": 0.2721, + "step": 324 + }, + { + "epoch": 0.6688963210702341, + "grad_norm": 0.19175754487514496, + "learning_rate": 1.47766323024055e-05, + "loss": 0.2754, + "step": 325 + }, + { + "epoch": 0.670954463596604, + "grad_norm": 0.18646575510501862, + "learning_rate": 1.475372279495991e-05, + "loss": 0.2678, + "step": 326 + }, + { + "epoch": 0.673012606122974, + "grad_norm": 0.18553243577480316, + "learning_rate": 1.4730813287514321e-05, + "loss": 0.281, + "step": 327 + }, + { + "epoch": 0.675070748649344, + "grad_norm": 0.17120976746082306, + "learning_rate": 1.470790378006873e-05, + "loss": 0.2691, + "step": 328 + }, + { + "epoch": 0.677128891175714, + "grad_norm": 0.19170524179935455, + "learning_rate": 1.4684994272623139e-05, + "loss": 0.2685, + "step": 329 + }, + { + "epoch": 0.6791870337020839, + "grad_norm": 0.1851339191198349, + "learning_rate": 1.466208476517755e-05, + "loss": 0.266, + "step": 330 + }, + { + "epoch": 0.6812451762284538, + "grad_norm": 0.1678062081336975, + "learning_rate": 1.4639175257731958e-05, + "loss": 0.2609, + "step": 331 + }, + { + "epoch": 0.6833033187548238, + "grad_norm": 0.17913252115249634, + "learning_rate": 1.4616265750286369e-05, + "loss": 0.2716, + "step": 332 + }, + { + "epoch": 0.6853614612811937, + "grad_norm": 0.1859239637851715, + "learning_rate": 1.459335624284078e-05, + "loss": 0.2712, + "step": 333 + }, + { + "epoch": 0.6874196038075637, + "grad_norm": 0.18390226364135742, + "learning_rate": 1.457044673539519e-05, + "loss": 0.2827, + "step": 334 + }, + { + "epoch": 0.6894777463339337, + "grad_norm": 0.18520398437976837, + "learning_rate": 1.4547537227949599e-05, + "loss": 0.2721, + "step": 335 + }, + { + "epoch": 0.6915358888603036, + "grad_norm": 0.18416717648506165, + "learning_rate": 1.452462772050401e-05, + "loss": 0.2683, + "step": 336 + }, + { + "epoch": 0.6935940313866735, + "grad_norm": 0.18727894127368927, + "learning_rate": 1.450171821305842e-05, + "loss": 0.2733, + "step": 337 + }, + { + "epoch": 0.6956521739130435, + "grad_norm": 0.18597093224525452, + "learning_rate": 1.447880870561283e-05, + "loss": 0.2708, + "step": 338 + }, + { + "epoch": 0.6977103164394134, + "grad_norm": 0.1786068081855774, + "learning_rate": 1.445589919816724e-05, + "loss": 0.2667, + "step": 339 + }, + { + "epoch": 0.6997684589657834, + "grad_norm": 0.17466600239276886, + "learning_rate": 1.443298969072165e-05, + "loss": 0.2786, + "step": 340 + }, + { + "epoch": 0.7018266014921534, + "grad_norm": 0.185857355594635, + "learning_rate": 1.4410080183276061e-05, + "loss": 0.2759, + "step": 341 + }, + { + "epoch": 0.7038847440185233, + "grad_norm": 0.2004527747631073, + "learning_rate": 1.4387170675830471e-05, + "loss": 0.2847, + "step": 342 + }, + { + "epoch": 0.7059428865448932, + "grad_norm": 0.18774060904979706, + "learning_rate": 1.436426116838488e-05, + "loss": 0.2766, + "step": 343 + }, + { + "epoch": 0.7080010290712632, + "grad_norm": 0.1840328425168991, + "learning_rate": 1.4341351660939291e-05, + "loss": 0.2722, + "step": 344 + }, + { + "epoch": 0.7100591715976331, + "grad_norm": 0.19089624285697937, + "learning_rate": 1.4318442153493702e-05, + "loss": 0.2779, + "step": 345 + }, + { + "epoch": 0.7121173141240031, + "grad_norm": 0.1848018616437912, + "learning_rate": 1.4295532646048112e-05, + "loss": 0.2739, + "step": 346 + }, + { + "epoch": 0.7141754566503731, + "grad_norm": 0.18844038248062134, + "learning_rate": 1.4272623138602521e-05, + "loss": 0.27, + "step": 347 + }, + { + "epoch": 0.716233599176743, + "grad_norm": 0.19289302825927734, + "learning_rate": 1.4249713631156932e-05, + "loss": 0.2743, + "step": 348 + }, + { + "epoch": 0.7182917417031129, + "grad_norm": 0.18738920986652374, + "learning_rate": 1.4226804123711342e-05, + "loss": 0.2657, + "step": 349 + }, + { + "epoch": 0.7203498842294829, + "grad_norm": 0.1925181746482849, + "learning_rate": 1.4203894616265753e-05, + "loss": 0.2637, + "step": 350 + }, + { + "epoch": 0.7224080267558528, + "grad_norm": 0.19114750623703003, + "learning_rate": 1.4180985108820162e-05, + "loss": 0.2758, + "step": 351 + }, + { + "epoch": 0.7244661692822228, + "grad_norm": 0.18310120701789856, + "learning_rate": 1.4158075601374572e-05, + "loss": 0.2777, + "step": 352 + }, + { + "epoch": 0.7265243118085928, + "grad_norm": 0.2045605331659317, + "learning_rate": 1.4135166093928983e-05, + "loss": 0.2653, + "step": 353 + }, + { + "epoch": 0.7285824543349627, + "grad_norm": 0.1856454759836197, + "learning_rate": 1.4112256586483393e-05, + "loss": 0.267, + "step": 354 + }, + { + "epoch": 0.7306405968613326, + "grad_norm": 0.1855366826057434, + "learning_rate": 1.4089347079037802e-05, + "loss": 0.2805, + "step": 355 + }, + { + "epoch": 0.7326987393877026, + "grad_norm": 0.17913414537906647, + "learning_rate": 1.4066437571592213e-05, + "loss": 0.2755, + "step": 356 + }, + { + "epoch": 0.7347568819140725, + "grad_norm": 0.2057684361934662, + "learning_rate": 1.404352806414662e-05, + "loss": 0.2668, + "step": 357 + }, + { + "epoch": 0.7368150244404424, + "grad_norm": 0.190156951546669, + "learning_rate": 1.402061855670103e-05, + "loss": 0.2778, + "step": 358 + }, + { + "epoch": 0.7388731669668125, + "grad_norm": 0.19387219846248627, + "learning_rate": 1.3997709049255441e-05, + "loss": 0.2785, + "step": 359 + }, + { + "epoch": 0.7409313094931824, + "grad_norm": 0.1933836042881012, + "learning_rate": 1.3974799541809852e-05, + "loss": 0.2661, + "step": 360 + }, + { + "epoch": 0.7429894520195524, + "grad_norm": 0.19618812203407288, + "learning_rate": 1.3951890034364261e-05, + "loss": 0.2622, + "step": 361 + }, + { + "epoch": 0.7450475945459223, + "grad_norm": 0.18786942958831787, + "learning_rate": 1.3928980526918671e-05, + "loss": 0.2695, + "step": 362 + }, + { + "epoch": 0.7471057370722922, + "grad_norm": 0.19361330568790436, + "learning_rate": 1.3906071019473082e-05, + "loss": 0.2869, + "step": 363 + }, + { + "epoch": 0.7491638795986622, + "grad_norm": 0.19813291728496552, + "learning_rate": 1.3883161512027493e-05, + "loss": 0.2753, + "step": 364 + }, + { + "epoch": 0.7512220221250322, + "grad_norm": 0.1891734004020691, + "learning_rate": 1.3860252004581902e-05, + "loss": 0.2694, + "step": 365 + }, + { + "epoch": 0.7532801646514021, + "grad_norm": 0.18902742862701416, + "learning_rate": 1.3837342497136312e-05, + "loss": 0.2675, + "step": 366 + }, + { + "epoch": 0.7553383071777721, + "grad_norm": 0.19838480651378632, + "learning_rate": 1.3814432989690723e-05, + "loss": 0.2721, + "step": 367 + }, + { + "epoch": 0.757396449704142, + "grad_norm": 0.20880939066410065, + "learning_rate": 1.3791523482245133e-05, + "loss": 0.2641, + "step": 368 + }, + { + "epoch": 0.7594545922305119, + "grad_norm": 0.20068003237247467, + "learning_rate": 1.3768613974799542e-05, + "loss": 0.2945, + "step": 369 + }, + { + "epoch": 0.7615127347568819, + "grad_norm": 0.19780132174491882, + "learning_rate": 1.3745704467353953e-05, + "loss": 0.2687, + "step": 370 + }, + { + "epoch": 0.7635708772832519, + "grad_norm": 0.19194689393043518, + "learning_rate": 1.3722794959908363e-05, + "loss": 0.2731, + "step": 371 + }, + { + "epoch": 0.7656290198096218, + "grad_norm": 0.19504573941230774, + "learning_rate": 1.3699885452462774e-05, + "loss": 0.2551, + "step": 372 + }, + { + "epoch": 0.7676871623359918, + "grad_norm": 0.18304413557052612, + "learning_rate": 1.3676975945017183e-05, + "loss": 0.2692, + "step": 373 + }, + { + "epoch": 0.7697453048623617, + "grad_norm": 0.2051483392715454, + "learning_rate": 1.3654066437571593e-05, + "loss": 0.2791, + "step": 374 + }, + { + "epoch": 0.7718034473887316, + "grad_norm": 0.18748973309993744, + "learning_rate": 1.3631156930126004e-05, + "loss": 0.2671, + "step": 375 + }, + { + "epoch": 0.7738615899151016, + "grad_norm": 0.19167177379131317, + "learning_rate": 1.3608247422680415e-05, + "loss": 0.2766, + "step": 376 + }, + { + "epoch": 0.7759197324414716, + "grad_norm": 0.17931750416755676, + "learning_rate": 1.3585337915234824e-05, + "loss": 0.2748, + "step": 377 + }, + { + "epoch": 0.7779778749678415, + "grad_norm": 0.19437509775161743, + "learning_rate": 1.3562428407789234e-05, + "loss": 0.2667, + "step": 378 + }, + { + "epoch": 0.7800360174942115, + "grad_norm": 0.19813868403434753, + "learning_rate": 1.3539518900343645e-05, + "loss": 0.2771, + "step": 379 + }, + { + "epoch": 0.7820941600205814, + "grad_norm": 0.19205260276794434, + "learning_rate": 1.3516609392898055e-05, + "loss": 0.2703, + "step": 380 + }, + { + "epoch": 0.7841523025469513, + "grad_norm": 0.19039763510227203, + "learning_rate": 1.3493699885452464e-05, + "loss": 0.264, + "step": 381 + }, + { + "epoch": 0.7862104450733213, + "grad_norm": 0.18269500136375427, + "learning_rate": 1.3470790378006875e-05, + "loss": 0.2653, + "step": 382 + }, + { + "epoch": 0.7882685875996913, + "grad_norm": 0.1922067403793335, + "learning_rate": 1.3447880870561285e-05, + "loss": 0.2754, + "step": 383 + }, + { + "epoch": 0.7903267301260612, + "grad_norm": 0.19615666568279266, + "learning_rate": 1.3424971363115693e-05, + "loss": 0.2811, + "step": 384 + }, + { + "epoch": 0.7923848726524312, + "grad_norm": 0.19037973880767822, + "learning_rate": 1.3402061855670103e-05, + "loss": 0.2673, + "step": 385 + }, + { + "epoch": 0.7944430151788011, + "grad_norm": 0.191124826669693, + "learning_rate": 1.3379152348224514e-05, + "loss": 0.2683, + "step": 386 + }, + { + "epoch": 0.796501157705171, + "grad_norm": 0.18429923057556152, + "learning_rate": 1.3356242840778923e-05, + "loss": 0.2698, + "step": 387 + }, + { + "epoch": 0.798559300231541, + "grad_norm": 0.1839045137166977, + "learning_rate": 1.3333333333333333e-05, + "loss": 0.2895, + "step": 388 + }, + { + "epoch": 0.800617442757911, + "grad_norm": 0.1944131702184677, + "learning_rate": 1.3310423825887744e-05, + "loss": 0.2641, + "step": 389 + }, + { + "epoch": 0.802675585284281, + "grad_norm": 0.20407740771770477, + "learning_rate": 1.3287514318442154e-05, + "loss": 0.2743, + "step": 390 + }, + { + "epoch": 0.8047337278106509, + "grad_norm": 0.1814037561416626, + "learning_rate": 1.3264604810996563e-05, + "loss": 0.2672, + "step": 391 + }, + { + "epoch": 0.8067918703370208, + "grad_norm": 0.1886950582265854, + "learning_rate": 1.3241695303550974e-05, + "loss": 0.2725, + "step": 392 + }, + { + "epoch": 0.8088500128633908, + "grad_norm": 0.19429941475391388, + "learning_rate": 1.3218785796105385e-05, + "loss": 0.2669, + "step": 393 + }, + { + "epoch": 0.8109081553897607, + "grad_norm": 0.19143058359622955, + "learning_rate": 1.3195876288659795e-05, + "loss": 0.2659, + "step": 394 + }, + { + "epoch": 0.8129662979161307, + "grad_norm": 0.2213468849658966, + "learning_rate": 1.3172966781214204e-05, + "loss": 0.2764, + "step": 395 + }, + { + "epoch": 0.8150244404425007, + "grad_norm": 0.2040800005197525, + "learning_rate": 1.3150057273768615e-05, + "loss": 0.2783, + "step": 396 + }, + { + "epoch": 0.8170825829688706, + "grad_norm": 0.1948375254869461, + "learning_rate": 1.3127147766323025e-05, + "loss": 0.2689, + "step": 397 + }, + { + "epoch": 0.8191407254952405, + "grad_norm": 0.1915021538734436, + "learning_rate": 1.3104238258877436e-05, + "loss": 0.2808, + "step": 398 + }, + { + "epoch": 0.8211988680216105, + "grad_norm": 0.19760248064994812, + "learning_rate": 1.3081328751431845e-05, + "loss": 0.2712, + "step": 399 + }, + { + "epoch": 0.8232570105479804, + "grad_norm": 0.2082677185535431, + "learning_rate": 1.3058419243986255e-05, + "loss": 0.2707, + "step": 400 + }, + { + "epoch": 0.8232570105479804, + "eval_loss": 0.28778496384620667, + "eval_runtime": 2427.9096, + "eval_samples_per_second": 3.202, + "eval_steps_per_second": 0.801, + "step": 400 + }, + { + "epoch": 0.8253151530743504, + "grad_norm": 0.19694332778453827, + "learning_rate": 1.3035509736540666e-05, + "loss": 0.2801, + "step": 401 + }, + { + "epoch": 0.8273732956007204, + "grad_norm": 0.19448824226856232, + "learning_rate": 1.3012600229095077e-05, + "loss": 0.2632, + "step": 402 + }, + { + "epoch": 0.8294314381270903, + "grad_norm": 0.18745476007461548, + "learning_rate": 1.2989690721649485e-05, + "loss": 0.2773, + "step": 403 + }, + { + "epoch": 0.8314895806534602, + "grad_norm": 0.19524575769901276, + "learning_rate": 1.2966781214203896e-05, + "loss": 0.2594, + "step": 404 + }, + { + "epoch": 0.8335477231798302, + "grad_norm": 0.19612252712249756, + "learning_rate": 1.2943871706758307e-05, + "loss": 0.271, + "step": 405 + }, + { + "epoch": 0.8356058657062001, + "grad_norm": 0.19964493811130524, + "learning_rate": 1.2920962199312717e-05, + "loss": 0.2615, + "step": 406 + }, + { + "epoch": 0.8376640082325701, + "grad_norm": 0.20115099847316742, + "learning_rate": 1.2898052691867126e-05, + "loss": 0.269, + "step": 407 + }, + { + "epoch": 0.8397221507589401, + "grad_norm": 0.18949687480926514, + "learning_rate": 1.2875143184421537e-05, + "loss": 0.2649, + "step": 408 + }, + { + "epoch": 0.84178029328531, + "grad_norm": 0.1931927353143692, + "learning_rate": 1.2852233676975947e-05, + "loss": 0.2611, + "step": 409 + }, + { + "epoch": 0.8438384358116799, + "grad_norm": 0.18723614513874054, + "learning_rate": 1.2829324169530358e-05, + "loss": 0.2699, + "step": 410 + }, + { + "epoch": 0.8458965783380499, + "grad_norm": 0.19405977427959442, + "learning_rate": 1.2806414662084765e-05, + "loss": 0.2691, + "step": 411 + }, + { + "epoch": 0.8479547208644198, + "grad_norm": 0.2021879404783249, + "learning_rate": 1.2783505154639176e-05, + "loss": 0.267, + "step": 412 + }, + { + "epoch": 0.8500128633907899, + "grad_norm": 0.20015574991703033, + "learning_rate": 1.2760595647193586e-05, + "loss": 0.2632, + "step": 413 + }, + { + "epoch": 0.8520710059171598, + "grad_norm": 0.19090059399604797, + "learning_rate": 1.2737686139747995e-05, + "loss": 0.2743, + "step": 414 + }, + { + "epoch": 0.8541291484435297, + "grad_norm": 0.1906920224428177, + "learning_rate": 1.2714776632302406e-05, + "loss": 0.2723, + "step": 415 + }, + { + "epoch": 0.8561872909698997, + "grad_norm": 0.19348129630088806, + "learning_rate": 1.2691867124856816e-05, + "loss": 0.2656, + "step": 416 + }, + { + "epoch": 0.8582454334962696, + "grad_norm": 0.18771213293075562, + "learning_rate": 1.2668957617411227e-05, + "loss": 0.2617, + "step": 417 + }, + { + "epoch": 0.8603035760226395, + "grad_norm": 0.2135135382413864, + "learning_rate": 1.2646048109965636e-05, + "loss": 0.2773, + "step": 418 + }, + { + "epoch": 0.8623617185490096, + "grad_norm": 0.19689443707466125, + "learning_rate": 1.2623138602520046e-05, + "loss": 0.2623, + "step": 419 + }, + { + "epoch": 0.8644198610753795, + "grad_norm": 0.18752440810203552, + "learning_rate": 1.2600229095074457e-05, + "loss": 0.2599, + "step": 420 + }, + { + "epoch": 0.8664780036017494, + "grad_norm": 0.19264395534992218, + "learning_rate": 1.2577319587628866e-05, + "loss": 0.2707, + "step": 421 + }, + { + "epoch": 0.8685361461281194, + "grad_norm": 0.19980797171592712, + "learning_rate": 1.2554410080183277e-05, + "loss": 0.2616, + "step": 422 + }, + { + "epoch": 0.8705942886544893, + "grad_norm": 0.22940242290496826, + "learning_rate": 1.2531500572737687e-05, + "loss": 0.2712, + "step": 423 + }, + { + "epoch": 0.8726524311808592, + "grad_norm": 0.18825359642505646, + "learning_rate": 1.2508591065292098e-05, + "loss": 0.2779, + "step": 424 + }, + { + "epoch": 0.8747105737072293, + "grad_norm": 0.21553562581539154, + "learning_rate": 1.2485681557846507e-05, + "loss": 0.2677, + "step": 425 + }, + { + "epoch": 0.8767687162335992, + "grad_norm": 0.2025568038225174, + "learning_rate": 1.2462772050400917e-05, + "loss": 0.2659, + "step": 426 + }, + { + "epoch": 0.8788268587599691, + "grad_norm": 0.19179950654506683, + "learning_rate": 1.2439862542955328e-05, + "loss": 0.2762, + "step": 427 + }, + { + "epoch": 0.8808850012863391, + "grad_norm": 0.20982210338115692, + "learning_rate": 1.2416953035509738e-05, + "loss": 0.2648, + "step": 428 + }, + { + "epoch": 0.882943143812709, + "grad_norm": 0.2084280252456665, + "learning_rate": 1.2394043528064147e-05, + "loss": 0.2806, + "step": 429 + }, + { + "epoch": 0.8850012863390789, + "grad_norm": 0.1993308663368225, + "learning_rate": 1.2371134020618558e-05, + "loss": 0.2673, + "step": 430 + }, + { + "epoch": 0.887059428865449, + "grad_norm": 0.1917535811662674, + "learning_rate": 1.2348224513172968e-05, + "loss": 0.2596, + "step": 431 + }, + { + "epoch": 0.8891175713918189, + "grad_norm": 0.18980742990970612, + "learning_rate": 1.2325315005727379e-05, + "loss": 0.2607, + "step": 432 + }, + { + "epoch": 0.8911757139181888, + "grad_norm": 0.21062685549259186, + "learning_rate": 1.2302405498281788e-05, + "loss": 0.2612, + "step": 433 + }, + { + "epoch": 0.8932338564445588, + "grad_norm": 0.20591405034065247, + "learning_rate": 1.2279495990836199e-05, + "loss": 0.2698, + "step": 434 + }, + { + "epoch": 0.8952919989709287, + "grad_norm": 0.2052398920059204, + "learning_rate": 1.2256586483390609e-05, + "loss": 0.2673, + "step": 435 + }, + { + "epoch": 0.8973501414972986, + "grad_norm": 0.19963452219963074, + "learning_rate": 1.223367697594502e-05, + "loss": 0.266, + "step": 436 + }, + { + "epoch": 0.8994082840236687, + "grad_norm": 0.1929163783788681, + "learning_rate": 1.2210767468499429e-05, + "loss": 0.2605, + "step": 437 + }, + { + "epoch": 0.9014664265500386, + "grad_norm": 0.19121681153774261, + "learning_rate": 1.218785796105384e-05, + "loss": 0.2642, + "step": 438 + }, + { + "epoch": 0.9035245690764085, + "grad_norm": 0.18931221961975098, + "learning_rate": 1.2164948453608248e-05, + "loss": 0.2653, + "step": 439 + }, + { + "epoch": 0.9055827116027785, + "grad_norm": 0.21359370648860931, + "learning_rate": 1.2142038946162657e-05, + "loss": 0.264, + "step": 440 + }, + { + "epoch": 0.9076408541291484, + "grad_norm": 0.1874193251132965, + "learning_rate": 1.2119129438717068e-05, + "loss": 0.2664, + "step": 441 + }, + { + "epoch": 0.9096989966555183, + "grad_norm": 0.19697226583957672, + "learning_rate": 1.2096219931271478e-05, + "loss": 0.2651, + "step": 442 + }, + { + "epoch": 0.9117571391818884, + "grad_norm": 0.20930957794189453, + "learning_rate": 1.2073310423825889e-05, + "loss": 0.2724, + "step": 443 + }, + { + "epoch": 0.9138152817082583, + "grad_norm": 0.19588977098464966, + "learning_rate": 1.2050400916380298e-05, + "loss": 0.2648, + "step": 444 + }, + { + "epoch": 0.9158734242346283, + "grad_norm": 0.19452017545700073, + "learning_rate": 1.2027491408934708e-05, + "loss": 0.2808, + "step": 445 + }, + { + "epoch": 0.9179315667609982, + "grad_norm": 0.19226408004760742, + "learning_rate": 1.2004581901489119e-05, + "loss": 0.2627, + "step": 446 + }, + { + "epoch": 0.9199897092873681, + "grad_norm": 0.18108274042606354, + "learning_rate": 1.198167239404353e-05, + "loss": 0.2693, + "step": 447 + }, + { + "epoch": 0.922047851813738, + "grad_norm": 0.19352363049983978, + "learning_rate": 1.1958762886597938e-05, + "loss": 0.2705, + "step": 448 + }, + { + "epoch": 0.9241059943401081, + "grad_norm": 0.18535122275352478, + "learning_rate": 1.1935853379152349e-05, + "loss": 0.2608, + "step": 449 + }, + { + "epoch": 0.926164136866478, + "grad_norm": 0.19209617376327515, + "learning_rate": 1.191294387170676e-05, + "loss": 0.2702, + "step": 450 + }, + { + "epoch": 0.928222279392848, + "grad_norm": 0.1866796910762787, + "learning_rate": 1.189003436426117e-05, + "loss": 0.264, + "step": 451 + }, + { + "epoch": 0.9302804219192179, + "grad_norm": 0.21708665788173676, + "learning_rate": 1.1867124856815579e-05, + "loss": 0.2693, + "step": 452 + }, + { + "epoch": 0.9323385644455878, + "grad_norm": 0.19297796487808228, + "learning_rate": 1.184421534936999e-05, + "loss": 0.2745, + "step": 453 + }, + { + "epoch": 0.9343967069719578, + "grad_norm": 0.19070400297641754, + "learning_rate": 1.18213058419244e-05, + "loss": 0.265, + "step": 454 + }, + { + "epoch": 0.9364548494983278, + "grad_norm": 0.19821566343307495, + "learning_rate": 1.1798396334478809e-05, + "loss": 0.2674, + "step": 455 + }, + { + "epoch": 0.9385129920246977, + "grad_norm": 0.2032192200422287, + "learning_rate": 1.177548682703322e-05, + "loss": 0.276, + "step": 456 + }, + { + "epoch": 0.9405711345510677, + "grad_norm": 0.19127750396728516, + "learning_rate": 1.175257731958763e-05, + "loss": 0.2696, + "step": 457 + }, + { + "epoch": 0.9426292770774376, + "grad_norm": 0.19187286496162415, + "learning_rate": 1.1729667812142041e-05, + "loss": 0.2601, + "step": 458 + }, + { + "epoch": 0.9446874196038075, + "grad_norm": 0.20871371030807495, + "learning_rate": 1.170675830469645e-05, + "loss": 0.2687, + "step": 459 + }, + { + "epoch": 0.9467455621301775, + "grad_norm": 0.19228306412696838, + "learning_rate": 1.168384879725086e-05, + "loss": 0.2633, + "step": 460 + }, + { + "epoch": 0.9488037046565475, + "grad_norm": 0.19025444984436035, + "learning_rate": 1.1660939289805271e-05, + "loss": 0.2721, + "step": 461 + }, + { + "epoch": 0.9508618471829174, + "grad_norm": 0.19476914405822754, + "learning_rate": 1.1638029782359682e-05, + "loss": 0.2662, + "step": 462 + }, + { + "epoch": 0.9529199897092874, + "grad_norm": 0.1991666853427887, + "learning_rate": 1.161512027491409e-05, + "loss": 0.269, + "step": 463 + }, + { + "epoch": 0.9549781322356573, + "grad_norm": 0.19385920464992523, + "learning_rate": 1.1592210767468501e-05, + "loss": 0.2647, + "step": 464 + }, + { + "epoch": 0.9570362747620272, + "grad_norm": 0.1911603957414627, + "learning_rate": 1.1569301260022912e-05, + "loss": 0.2679, + "step": 465 + }, + { + "epoch": 0.9590944172883972, + "grad_norm": 0.20373377203941345, + "learning_rate": 1.1546391752577319e-05, + "loss": 0.2694, + "step": 466 + }, + { + "epoch": 0.9611525598147672, + "grad_norm": 0.20550350844860077, + "learning_rate": 1.152348224513173e-05, + "loss": 0.2677, + "step": 467 + }, + { + "epoch": 0.9632107023411371, + "grad_norm": 0.2049354463815689, + "learning_rate": 1.150057273768614e-05, + "loss": 0.2752, + "step": 468 + }, + { + "epoch": 0.9652688448675071, + "grad_norm": 0.21691595017910004, + "learning_rate": 1.147766323024055e-05, + "loss": 0.2727, + "step": 469 + }, + { + "epoch": 0.967326987393877, + "grad_norm": 0.20727306604385376, + "learning_rate": 1.145475372279496e-05, + "loss": 0.2575, + "step": 470 + }, + { + "epoch": 0.969385129920247, + "grad_norm": 0.19166423380374908, + "learning_rate": 1.143184421534937e-05, + "loss": 0.2716, + "step": 471 + }, + { + "epoch": 0.9714432724466169, + "grad_norm": 0.18833886086940765, + "learning_rate": 1.140893470790378e-05, + "loss": 0.2651, + "step": 472 + }, + { + "epoch": 0.9735014149729869, + "grad_norm": 0.19680088758468628, + "learning_rate": 1.1386025200458191e-05, + "loss": 0.2621, + "step": 473 + }, + { + "epoch": 0.9755595574993569, + "grad_norm": 0.20966476202011108, + "learning_rate": 1.13631156930126e-05, + "loss": 0.2725, + "step": 474 + }, + { + "epoch": 0.9776177000257268, + "grad_norm": 0.1963450163602829, + "learning_rate": 1.134020618556701e-05, + "loss": 0.2569, + "step": 475 + }, + { + "epoch": 0.9796758425520967, + "grad_norm": 0.21289944648742676, + "learning_rate": 1.1317296678121421e-05, + "loss": 0.2622, + "step": 476 + }, + { + "epoch": 0.9817339850784667, + "grad_norm": 0.2103341966867447, + "learning_rate": 1.1294387170675832e-05, + "loss": 0.2803, + "step": 477 + }, + { + "epoch": 0.9837921276048366, + "grad_norm": 0.20202945172786713, + "learning_rate": 1.1271477663230241e-05, + "loss": 0.273, + "step": 478 + }, + { + "epoch": 0.9858502701312066, + "grad_norm": 0.18241006135940552, + "learning_rate": 1.1248568155784651e-05, + "loss": 0.2721, + "step": 479 + }, + { + "epoch": 0.9879084126575766, + "grad_norm": 0.19221259653568268, + "learning_rate": 1.1225658648339062e-05, + "loss": 0.2646, + "step": 480 + }, + { + "epoch": 0.9899665551839465, + "grad_norm": 0.19371837377548218, + "learning_rate": 1.1202749140893473e-05, + "loss": 0.2519, + "step": 481 + }, + { + "epoch": 0.9920246977103164, + "grad_norm": 0.1972094029188156, + "learning_rate": 1.1179839633447882e-05, + "loss": 0.2555, + "step": 482 + }, + { + "epoch": 0.9940828402366864, + "grad_norm": 0.19414126873016357, + "learning_rate": 1.1156930126002292e-05, + "loss": 0.2726, + "step": 483 + }, + { + "epoch": 0.9961409827630563, + "grad_norm": 0.18993492424488068, + "learning_rate": 1.1134020618556703e-05, + "loss": 0.2644, + "step": 484 + }, + { + "epoch": 0.9981991252894263, + "grad_norm": 0.19713927805423737, + "learning_rate": 1.1111111111111113e-05, + "loss": 0.2569, + "step": 485 + }, + { + "epoch": 1.00205814252637, + "grad_norm": 0.3423589766025543, + "learning_rate": 1.1088201603665522e-05, + "loss": 0.5285, + "step": 486 + }, + { + "epoch": 1.0041162850527399, + "grad_norm": 0.1901763528585434, + "learning_rate": 1.1065292096219933e-05, + "loss": 0.2621, + "step": 487 + }, + { + "epoch": 1.0061744275791098, + "grad_norm": 0.20508776605129242, + "learning_rate": 1.1042382588774343e-05, + "loss": 0.2665, + "step": 488 + }, + { + "epoch": 1.0082325701054797, + "grad_norm": 0.20188146829605103, + "learning_rate": 1.1019473081328752e-05, + "loss": 0.2547, + "step": 489 + }, + { + "epoch": 1.0102907126318497, + "grad_norm": 0.20245613157749176, + "learning_rate": 1.0996563573883163e-05, + "loss": 0.2657, + "step": 490 + }, + { + "epoch": 1.0123488551582196, + "grad_norm": 0.19711382687091827, + "learning_rate": 1.0973654066437574e-05, + "loss": 0.2597, + "step": 491 + }, + { + "epoch": 1.0144069976845898, + "grad_norm": 0.21538953483104706, + "learning_rate": 1.0950744558991984e-05, + "loss": 0.2727, + "step": 492 + }, + { + "epoch": 1.0164651402109597, + "grad_norm": 0.20296984910964966, + "learning_rate": 1.0927835051546391e-05, + "loss": 0.2634, + "step": 493 + }, + { + "epoch": 1.0185232827373296, + "grad_norm": 0.20134592056274414, + "learning_rate": 1.0904925544100802e-05, + "loss": 0.2596, + "step": 494 + }, + { + "epoch": 1.0205814252636995, + "grad_norm": 0.200101837515831, + "learning_rate": 1.0882016036655212e-05, + "loss": 0.2575, + "step": 495 + }, + { + "epoch": 1.0226395677900695, + "grad_norm": 0.19144928455352783, + "learning_rate": 1.0859106529209621e-05, + "loss": 0.263, + "step": 496 + }, + { + "epoch": 1.0246977103164394, + "grad_norm": 0.19832482933998108, + "learning_rate": 1.0836197021764032e-05, + "loss": 0.2656, + "step": 497 + }, + { + "epoch": 1.0267558528428093, + "grad_norm": 0.20965202152729034, + "learning_rate": 1.0813287514318443e-05, + "loss": 0.2611, + "step": 498 + }, + { + "epoch": 1.0288139953691793, + "grad_norm": 0.1974337100982666, + "learning_rate": 1.0790378006872853e-05, + "loss": 0.2667, + "step": 499 + }, + { + "epoch": 1.0308721378955492, + "grad_norm": 0.20611713826656342, + "learning_rate": 1.0767468499427262e-05, + "loss": 0.2674, + "step": 500 + }, + { + "epoch": 1.0308721378955492, + "eval_loss": 0.2836935222148895, + "eval_runtime": 2423.44, + "eval_samples_per_second": 3.208, + "eval_steps_per_second": 0.802, + "step": 500 + } + ], + "logging_steps": 1, + "max_steps": 970, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.6525097664804864e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-500/training_args.bin b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..5999c7ee9dd10ee9076d748e4757533e635fa832 --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ee55a11f5a306eb7c39b536fdfe2459bc279e468da50f6adda478c4deffcb812 +size 5688 diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-600/README.md b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-600/README.md new file mode 100644 index 0000000000000000000000000000000000000000..fb0d77d70fdc5c829c8889cb85828736b7eb9714 --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-600/README.md @@ -0,0 +1,202 @@ +--- +base_model: unsloth/codegemma-7b-it +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.15.0 \ No newline at end of file diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-600/adapter_config.json b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-600/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e841602c6a59fc7b085ac647af4d4c312445d261 --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-600/adapter_config.json @@ -0,0 +1,39 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "unsloth/codegemma-7b-it", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "o_proj", + "up_proj", + "q_proj", + "down_proj", + "gate_proj", + "k_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-600/adapter_model.safetensors b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-600/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..49efb1b57c14c681c14a2879d16c45d0a76deaee --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-600/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a4a4eb27a524b992a8e59aea93b00a2ffc7e7415d2d6011c4071f010d9389f6 +size 800116456 diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-600/optimizer.pt b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-600/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..c911d1de23ca89d408e756b97329a785139f5342 --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-600/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:75b79cc1e894c58b1cf7627b10b559ab9a7003c61a3fb64ea34eb22fa3eb2feb +size 406743860 diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-600/rng_state.pth b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-600/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..1c6c70515a94cc54e7ff111e913729f07d147ca7 --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-600/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df8b592b7c7055ceb0aa14cb6aa6d9165595cd462a195f29f85bbd715272a8ee +size 14244 diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-600/scheduler.pt b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-600/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..89c1e781edc457b7c1431bed665151e2eba2ddd1 --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-600/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9573cecdf609ea52af7d33d9f347cb04e9ebd442806c98544995035a14de89dd +size 1064 diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-600/special_tokens_map.json b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-600/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..f6119589e367b2de0fc8cbd2f1217667532e3174 --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-600/special_tokens_map.json @@ -0,0 +1,30 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-600/tokenizer.json b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-600/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..45a5e23f54141c5f4f97a8d58f3ffadc28e287ba --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-600/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d964a2c8346d40f95791533eae48730d5f163c2e65fd16333560fd3e661df318 +size 34362915 diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-600/tokenizer.model b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-600/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..71a98ce40269d847e58957e1e070d9ae8eb184af --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-600/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:583f2ebd2a1936009b7da991ea255504db68c7a9713a78673d1335a87098966c +size 4241023 diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-600/tokenizer_config.json b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-600/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9b9b1b4acdd4afcedae39d1cf6f0bc7ef7d9910f --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-600/tokenizer_config.json @@ -0,0 +1,2011 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "<|fim_prefix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "<|fim_middle|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "<|fim_suffix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "<|file_separator|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "139": { + "content": "▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "140": { + "content": "▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "141": { + "content": "▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "142": { + "content": "▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "143": { + "content": "▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "144": { + "content": "▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "145": { + "content": "▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "146": { + "content": "▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "147": { + "content": "▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "148": { + "content": "▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "149": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "150": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "152": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "153": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "154": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "155": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "156": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "157": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "158": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "159": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "160": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "161": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "162": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "163": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "164": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "165": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "166": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "167": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "168": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255968": { + "content": "[toxicity=0]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255969": { + "content": "\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255970": { + "content": "\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255971": { + "content": "\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255972": { + "content": "\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255973": { + "content": "\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255974": { + "content": "\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255975": { + "content": "\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255976": { + "content": "\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255977": { + "content": "\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255978": { + "content": "\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255979": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255980": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255981": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255982": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255983": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255984": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255985": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255986": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255987": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255988": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255989": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255990": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255991": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255992": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255993": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255994": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255995": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255996": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255997": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255998": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255999": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "bos_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "extra_special_tokens": {}, + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "padding_side": "left", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-600/trainer_state.json b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-600/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..afc8b89bce2e2a3d43cc6d769990ba0482051301 --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-600/trainer_state.json @@ -0,0 +1,4282 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.2366863905325443, + "eval_steps": 100, + "global_step": 600, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0020581425263699513, + "grad_norm": 11.994463920593262, + "learning_rate": 2.061855670103093e-07, + "loss": 2.91, + "step": 1 + }, + { + "epoch": 0.004116285052739903, + "grad_norm": 11.769092559814453, + "learning_rate": 4.123711340206186e-07, + "loss": 2.8686, + "step": 2 + }, + { + "epoch": 0.0061744275791098535, + "grad_norm": 13.05551815032959, + "learning_rate": 6.185567010309279e-07, + "loss": 3.0286, + "step": 3 + }, + { + "epoch": 0.008232570105479805, + "grad_norm": 12.334521293640137, + "learning_rate": 8.247422680412372e-07, + "loss": 2.904, + "step": 4 + }, + { + "epoch": 0.010290712631849755, + "grad_norm": 12.075353622436523, + "learning_rate": 1.0309278350515464e-06, + "loss": 2.8991, + "step": 5 + }, + { + "epoch": 0.012348855158219707, + "grad_norm": 11.86032485961914, + "learning_rate": 1.2371134020618557e-06, + "loss": 3.0007, + "step": 6 + }, + { + "epoch": 0.014406997684589657, + "grad_norm": 10.10457992553711, + "learning_rate": 1.4432989690721649e-06, + "loss": 2.8493, + "step": 7 + }, + { + "epoch": 0.01646514021095961, + "grad_norm": 8.56408405303955, + "learning_rate": 1.6494845360824744e-06, + "loss": 2.9573, + "step": 8 + }, + { + "epoch": 0.01852328273732956, + "grad_norm": 6.307392120361328, + "learning_rate": 1.8556701030927837e-06, + "loss": 2.9507, + "step": 9 + }, + { + "epoch": 0.02058142526369951, + "grad_norm": 4.276430130004883, + "learning_rate": 2.061855670103093e-06, + "loss": 2.8988, + "step": 10 + }, + { + "epoch": 0.022639567790069464, + "grad_norm": 2.5912015438079834, + "learning_rate": 2.268041237113402e-06, + "loss": 2.9926, + "step": 11 + }, + { + "epoch": 0.024697710316439414, + "grad_norm": 2.018446207046509, + "learning_rate": 2.4742268041237115e-06, + "loss": 2.9874, + "step": 12 + }, + { + "epoch": 0.026755852842809364, + "grad_norm": 1.8558588027954102, + "learning_rate": 2.680412371134021e-06, + "loss": 2.8608, + "step": 13 + }, + { + "epoch": 0.028813995369179314, + "grad_norm": 1.9658265113830566, + "learning_rate": 2.8865979381443297e-06, + "loss": 2.8596, + "step": 14 + }, + { + "epoch": 0.030872137895549268, + "grad_norm": 1.872044563293457, + "learning_rate": 3.0927835051546395e-06, + "loss": 2.8836, + "step": 15 + }, + { + "epoch": 0.03293028042191922, + "grad_norm": 1.8884096145629883, + "learning_rate": 3.298969072164949e-06, + "loss": 2.9383, + "step": 16 + }, + { + "epoch": 0.03498842294828917, + "grad_norm": 1.8795744180679321, + "learning_rate": 3.5051546391752577e-06, + "loss": 2.883, + "step": 17 + }, + { + "epoch": 0.03704656547465912, + "grad_norm": 1.783678412437439, + "learning_rate": 3.7113402061855674e-06, + "loss": 2.8019, + "step": 18 + }, + { + "epoch": 0.039104708001029075, + "grad_norm": 1.820617914199829, + "learning_rate": 3.917525773195877e-06, + "loss": 2.8813, + "step": 19 + }, + { + "epoch": 0.04116285052739902, + "grad_norm": 1.8188731670379639, + "learning_rate": 4.123711340206186e-06, + "loss": 2.8401, + "step": 20 + }, + { + "epoch": 0.043220993053768975, + "grad_norm": 1.7305251359939575, + "learning_rate": 4.329896907216495e-06, + "loss": 2.7478, + "step": 21 + }, + { + "epoch": 0.04527913558013893, + "grad_norm": 1.7014551162719727, + "learning_rate": 4.536082474226804e-06, + "loss": 2.7356, + "step": 22 + }, + { + "epoch": 0.047337278106508875, + "grad_norm": 1.677381157875061, + "learning_rate": 4.742268041237113e-06, + "loss": 2.7593, + "step": 23 + }, + { + "epoch": 0.04939542063287883, + "grad_norm": 1.628554344177246, + "learning_rate": 4.948453608247423e-06, + "loss": 2.7689, + "step": 24 + }, + { + "epoch": 0.051453563159248775, + "grad_norm": 1.4968128204345703, + "learning_rate": 5.154639175257732e-06, + "loss": 2.6613, + "step": 25 + }, + { + "epoch": 0.05351170568561873, + "grad_norm": 1.4734832048416138, + "learning_rate": 5.360824742268042e-06, + "loss": 2.7095, + "step": 26 + }, + { + "epoch": 0.05556984821198868, + "grad_norm": 1.3745571374893188, + "learning_rate": 5.567010309278351e-06, + "loss": 2.655, + "step": 27 + }, + { + "epoch": 0.05762799073835863, + "grad_norm": 1.3381729125976562, + "learning_rate": 5.7731958762886594e-06, + "loss": 2.55, + "step": 28 + }, + { + "epoch": 0.05968613326472858, + "grad_norm": 1.3388073444366455, + "learning_rate": 5.979381443298969e-06, + "loss": 2.5219, + "step": 29 + }, + { + "epoch": 0.061744275791098535, + "grad_norm": 1.317008376121521, + "learning_rate": 6.185567010309279e-06, + "loss": 2.4491, + "step": 30 + }, + { + "epoch": 0.06380241831746848, + "grad_norm": 1.3210794925689697, + "learning_rate": 6.391752577319588e-06, + "loss": 2.4358, + "step": 31 + }, + { + "epoch": 0.06586056084383844, + "grad_norm": 1.182519555091858, + "learning_rate": 6.597938144329898e-06, + "loss": 2.4514, + "step": 32 + }, + { + "epoch": 0.06791870337020839, + "grad_norm": 1.2238099575042725, + "learning_rate": 6.804123711340207e-06, + "loss": 2.442, + "step": 33 + }, + { + "epoch": 0.06997684589657834, + "grad_norm": 1.1793314218521118, + "learning_rate": 7.010309278350515e-06, + "loss": 2.3864, + "step": 34 + }, + { + "epoch": 0.0720349884229483, + "grad_norm": 1.1983020305633545, + "learning_rate": 7.216494845360825e-06, + "loss": 2.3796, + "step": 35 + }, + { + "epoch": 0.07409313094931824, + "grad_norm": 1.2189652919769287, + "learning_rate": 7.422680412371135e-06, + "loss": 2.4152, + "step": 36 + }, + { + "epoch": 0.07615127347568819, + "grad_norm": 1.14923095703125, + "learning_rate": 7.628865979381444e-06, + "loss": 2.3298, + "step": 37 + }, + { + "epoch": 0.07820941600205815, + "grad_norm": 1.147013545036316, + "learning_rate": 7.835051546391754e-06, + "loss": 2.2488, + "step": 38 + }, + { + "epoch": 0.0802675585284281, + "grad_norm": 1.133981466293335, + "learning_rate": 8.041237113402063e-06, + "loss": 2.1825, + "step": 39 + }, + { + "epoch": 0.08232570105479804, + "grad_norm": 1.1686867475509644, + "learning_rate": 8.247422680412371e-06, + "loss": 2.2282, + "step": 40 + }, + { + "epoch": 0.084383843581168, + "grad_norm": 1.131690502166748, + "learning_rate": 8.453608247422681e-06, + "loss": 2.0962, + "step": 41 + }, + { + "epoch": 0.08644198610753795, + "grad_norm": 1.1626195907592773, + "learning_rate": 8.65979381443299e-06, + "loss": 2.1161, + "step": 42 + }, + { + "epoch": 0.0885001286339079, + "grad_norm": 1.1508581638336182, + "learning_rate": 8.865979381443299e-06, + "loss": 1.9856, + "step": 43 + }, + { + "epoch": 0.09055827116027786, + "grad_norm": 1.2286733388900757, + "learning_rate": 9.072164948453609e-06, + "loss": 2.076, + "step": 44 + }, + { + "epoch": 0.0926164136866478, + "grad_norm": 1.82068932056427, + "learning_rate": 9.278350515463918e-06, + "loss": 1.9995, + "step": 45 + }, + { + "epoch": 0.09467455621301775, + "grad_norm": 2.079101324081421, + "learning_rate": 9.484536082474226e-06, + "loss": 1.9601, + "step": 46 + }, + { + "epoch": 0.0967326987393877, + "grad_norm": 1.1209226846694946, + "learning_rate": 9.690721649484536e-06, + "loss": 1.9346, + "step": 47 + }, + { + "epoch": 0.09879084126575766, + "grad_norm": 1.0579711198806763, + "learning_rate": 9.896907216494846e-06, + "loss": 1.8764, + "step": 48 + }, + { + "epoch": 0.1008489837921276, + "grad_norm": 1.0434011220932007, + "learning_rate": 1.0103092783505156e-05, + "loss": 1.8483, + "step": 49 + }, + { + "epoch": 0.10290712631849755, + "grad_norm": 1.0089991092681885, + "learning_rate": 1.0309278350515464e-05, + "loss": 1.8018, + "step": 50 + }, + { + "epoch": 0.10496526884486751, + "grad_norm": 1.0117324590682983, + "learning_rate": 1.0515463917525775e-05, + "loss": 1.8003, + "step": 51 + }, + { + "epoch": 0.10702341137123746, + "grad_norm": 1.0006697177886963, + "learning_rate": 1.0721649484536083e-05, + "loss": 1.7482, + "step": 52 + }, + { + "epoch": 0.1090815538976074, + "grad_norm": 2.1164329051971436, + "learning_rate": 1.0927835051546391e-05, + "loss": 1.7363, + "step": 53 + }, + { + "epoch": 0.11113969642397736, + "grad_norm": 0.9573502540588379, + "learning_rate": 1.1134020618556703e-05, + "loss": 1.661, + "step": 54 + }, + { + "epoch": 0.11319783895034731, + "grad_norm": 1.0059764385223389, + "learning_rate": 1.134020618556701e-05, + "loss": 1.6979, + "step": 55 + }, + { + "epoch": 0.11525598147671726, + "grad_norm": 0.9719656109809875, + "learning_rate": 1.1546391752577319e-05, + "loss": 1.6318, + "step": 56 + }, + { + "epoch": 0.11731412400308722, + "grad_norm": 1.0024539232254028, + "learning_rate": 1.175257731958763e-05, + "loss": 1.6283, + "step": 57 + }, + { + "epoch": 0.11937226652945716, + "grad_norm": 0.9772456288337708, + "learning_rate": 1.1958762886597938e-05, + "loss": 1.5611, + "step": 58 + }, + { + "epoch": 0.12143040905582711, + "grad_norm": 0.9947625994682312, + "learning_rate": 1.2164948453608248e-05, + "loss": 1.6073, + "step": 59 + }, + { + "epoch": 0.12348855158219707, + "grad_norm": 2.112889051437378, + "learning_rate": 1.2371134020618558e-05, + "loss": 1.6208, + "step": 60 + }, + { + "epoch": 0.12554669410856703, + "grad_norm": 1.0515345335006714, + "learning_rate": 1.2577319587628866e-05, + "loss": 1.569, + "step": 61 + }, + { + "epoch": 0.12760483663493696, + "grad_norm": 1.0782145261764526, + "learning_rate": 1.2783505154639176e-05, + "loss": 1.5097, + "step": 62 + }, + { + "epoch": 0.12966297916130692, + "grad_norm": 1.154104232788086, + "learning_rate": 1.2989690721649485e-05, + "loss": 1.5472, + "step": 63 + }, + { + "epoch": 0.13172112168767688, + "grad_norm": 1.1614656448364258, + "learning_rate": 1.3195876288659795e-05, + "loss": 1.4833, + "step": 64 + }, + { + "epoch": 0.13377926421404682, + "grad_norm": 1.1720911264419556, + "learning_rate": 1.3402061855670103e-05, + "loss": 1.4644, + "step": 65 + }, + { + "epoch": 0.13583740674041678, + "grad_norm": 1.8903896808624268, + "learning_rate": 1.3608247422680415e-05, + "loss": 1.4286, + "step": 66 + }, + { + "epoch": 0.13789554926678674, + "grad_norm": 1.2675013542175293, + "learning_rate": 1.3814432989690723e-05, + "loss": 1.416, + "step": 67 + }, + { + "epoch": 0.13995369179315667, + "grad_norm": 1.266434907913208, + "learning_rate": 1.402061855670103e-05, + "loss": 1.3171, + "step": 68 + }, + { + "epoch": 0.14201183431952663, + "grad_norm": 1.3408889770507812, + "learning_rate": 1.4226804123711342e-05, + "loss": 1.3396, + "step": 69 + }, + { + "epoch": 0.1440699768458966, + "grad_norm": 1.3862446546554565, + "learning_rate": 1.443298969072165e-05, + "loss": 1.2642, + "step": 70 + }, + { + "epoch": 0.14612811937226652, + "grad_norm": 2.110553026199341, + "learning_rate": 1.4639175257731958e-05, + "loss": 1.2593, + "step": 71 + }, + { + "epoch": 0.14818626189863648, + "grad_norm": 1.7017499208450317, + "learning_rate": 1.484536082474227e-05, + "loss": 1.24, + "step": 72 + }, + { + "epoch": 0.15024440442500644, + "grad_norm": 1.9851700067520142, + "learning_rate": 1.5051546391752578e-05, + "loss": 1.2313, + "step": 73 + }, + { + "epoch": 0.15230254695137638, + "grad_norm": 2.009608030319214, + "learning_rate": 1.5257731958762888e-05, + "loss": 1.1281, + "step": 74 + }, + { + "epoch": 0.15436068947774634, + "grad_norm": 2.7587485313415527, + "learning_rate": 1.5463917525773197e-05, + "loss": 1.1248, + "step": 75 + }, + { + "epoch": 0.1564188320041163, + "grad_norm": 2.780954599380493, + "learning_rate": 1.5670103092783507e-05, + "loss": 1.0797, + "step": 76 + }, + { + "epoch": 0.15847697453048623, + "grad_norm": 3.1470866203308105, + "learning_rate": 1.5876288659793813e-05, + "loss": 1.0064, + "step": 77 + }, + { + "epoch": 0.1605351170568562, + "grad_norm": 4.653595447540283, + "learning_rate": 1.6082474226804127e-05, + "loss": 0.9219, + "step": 78 + }, + { + "epoch": 0.16259325958322615, + "grad_norm": 4.157363414764404, + "learning_rate": 1.6288659793814433e-05, + "loss": 0.8709, + "step": 79 + }, + { + "epoch": 0.16465140210959608, + "grad_norm": 4.5814924240112305, + "learning_rate": 1.6494845360824743e-05, + "loss": 0.7693, + "step": 80 + }, + { + "epoch": 0.16670954463596604, + "grad_norm": 5.096139907836914, + "learning_rate": 1.6701030927835052e-05, + "loss": 0.6868, + "step": 81 + }, + { + "epoch": 0.168767687162336, + "grad_norm": 4.858880519866943, + "learning_rate": 1.6907216494845362e-05, + "loss": 0.5971, + "step": 82 + }, + { + "epoch": 0.17082582968870594, + "grad_norm": 4.42564582824707, + "learning_rate": 1.7113402061855672e-05, + "loss": 0.4719, + "step": 83 + }, + { + "epoch": 0.1728839722150759, + "grad_norm": 7.720851421356201, + "learning_rate": 1.731958762886598e-05, + "loss": 0.3943, + "step": 84 + }, + { + "epoch": 0.17494211474144586, + "grad_norm": 0.41923192143440247, + "learning_rate": 1.752577319587629e-05, + "loss": 0.3635, + "step": 85 + }, + { + "epoch": 0.1770002572678158, + "grad_norm": 0.2771846354007721, + "learning_rate": 1.7731958762886598e-05, + "loss": 0.3597, + "step": 86 + }, + { + "epoch": 0.17905839979418575, + "grad_norm": 0.24761857092380524, + "learning_rate": 1.793814432989691e-05, + "loss": 0.3735, + "step": 87 + }, + { + "epoch": 0.1811165423205557, + "grad_norm": 0.23277048766613007, + "learning_rate": 1.8144329896907217e-05, + "loss": 0.3643, + "step": 88 + }, + { + "epoch": 0.18317468484692565, + "grad_norm": 0.22931228578090668, + "learning_rate": 1.8350515463917527e-05, + "loss": 0.3519, + "step": 89 + }, + { + "epoch": 0.1852328273732956, + "grad_norm": 0.20750615000724792, + "learning_rate": 1.8556701030927837e-05, + "loss": 0.3431, + "step": 90 + }, + { + "epoch": 0.18729096989966554, + "grad_norm": 0.2080322951078415, + "learning_rate": 1.8762886597938147e-05, + "loss": 0.3632, + "step": 91 + }, + { + "epoch": 0.1893491124260355, + "grad_norm": 0.20186181366443634, + "learning_rate": 1.8969072164948453e-05, + "loss": 0.3492, + "step": 92 + }, + { + "epoch": 0.19140725495240546, + "grad_norm": 0.19172786176204681, + "learning_rate": 1.9175257731958766e-05, + "loss": 0.3552, + "step": 93 + }, + { + "epoch": 0.1934653974787754, + "grad_norm": 0.1747850626707077, + "learning_rate": 1.9381443298969072e-05, + "loss": 0.3355, + "step": 94 + }, + { + "epoch": 0.19552354000514535, + "grad_norm": 0.196411594748497, + "learning_rate": 1.9587628865979382e-05, + "loss": 0.3271, + "step": 95 + }, + { + "epoch": 0.1975816825315153, + "grad_norm": 0.20063228905200958, + "learning_rate": 1.9793814432989692e-05, + "loss": 0.3351, + "step": 96 + }, + { + "epoch": 0.19963982505788525, + "grad_norm": 0.19240939617156982, + "learning_rate": 2e-05, + "loss": 0.3266, + "step": 97 + }, + { + "epoch": 0.2016979675842552, + "grad_norm": 0.18206572532653809, + "learning_rate": 1.997709049255441e-05, + "loss": 0.3393, + "step": 98 + }, + { + "epoch": 0.20375611011062517, + "grad_norm": 0.20384562015533447, + "learning_rate": 1.9954180985108823e-05, + "loss": 0.3395, + "step": 99 + }, + { + "epoch": 0.2058142526369951, + "grad_norm": 0.19944581389427185, + "learning_rate": 1.9931271477663232e-05, + "loss": 0.3268, + "step": 100 + }, + { + "epoch": 0.2058142526369951, + "eval_loss": 0.3456890285015106, + "eval_runtime": 2114.0178, + "eval_samples_per_second": 3.677, + "eval_steps_per_second": 0.92, + "step": 100 + }, + { + "epoch": 0.20787239516336506, + "grad_norm": 0.17743557691574097, + "learning_rate": 1.990836197021764e-05, + "loss": 0.3439, + "step": 101 + }, + { + "epoch": 0.20993053768973502, + "grad_norm": 0.18746449053287506, + "learning_rate": 1.9885452462772053e-05, + "loss": 0.326, + "step": 102 + }, + { + "epoch": 0.21198868021610495, + "grad_norm": 0.18555815517902374, + "learning_rate": 1.9862542955326462e-05, + "loss": 0.3337, + "step": 103 + }, + { + "epoch": 0.2140468227424749, + "grad_norm": 0.16591575741767883, + "learning_rate": 1.9839633447880874e-05, + "loss": 0.3121, + "step": 104 + }, + { + "epoch": 0.21610496526884487, + "grad_norm": 0.1621987372636795, + "learning_rate": 1.9816723940435283e-05, + "loss": 0.3287, + "step": 105 + }, + { + "epoch": 0.2181631077952148, + "grad_norm": 0.1614532470703125, + "learning_rate": 1.9793814432989692e-05, + "loss": 0.3306, + "step": 106 + }, + { + "epoch": 0.22022125032158477, + "grad_norm": 0.17993387579917908, + "learning_rate": 1.9770904925544104e-05, + "loss": 0.3341, + "step": 107 + }, + { + "epoch": 0.22227939284795473, + "grad_norm": 0.1550011783838272, + "learning_rate": 1.9747995418098513e-05, + "loss": 0.3197, + "step": 108 + }, + { + "epoch": 0.22433753537432466, + "grad_norm": 0.18471524119377136, + "learning_rate": 1.9725085910652922e-05, + "loss": 0.3285, + "step": 109 + }, + { + "epoch": 0.22639567790069462, + "grad_norm": 0.15604373812675476, + "learning_rate": 1.9702176403207334e-05, + "loss": 0.3298, + "step": 110 + }, + { + "epoch": 0.22845382042706458, + "grad_norm": 0.1682298630475998, + "learning_rate": 1.9679266895761743e-05, + "loss": 0.3343, + "step": 111 + }, + { + "epoch": 0.2305119629534345, + "grad_norm": 0.14933635294437408, + "learning_rate": 1.9656357388316152e-05, + "loss": 0.3134, + "step": 112 + }, + { + "epoch": 0.23257010547980447, + "grad_norm": 0.14892347157001495, + "learning_rate": 1.963344788087056e-05, + "loss": 0.3154, + "step": 113 + }, + { + "epoch": 0.23462824800617443, + "grad_norm": 0.1577889323234558, + "learning_rate": 1.9610538373424973e-05, + "loss": 0.3122, + "step": 114 + }, + { + "epoch": 0.23668639053254437, + "grad_norm": 0.16482344269752502, + "learning_rate": 1.9587628865979382e-05, + "loss": 0.3193, + "step": 115 + }, + { + "epoch": 0.23874453305891433, + "grad_norm": 0.15328913927078247, + "learning_rate": 1.956471935853379e-05, + "loss": 0.3217, + "step": 116 + }, + { + "epoch": 0.2408026755852843, + "grad_norm": 0.16140656173229218, + "learning_rate": 1.9541809851088203e-05, + "loss": 0.318, + "step": 117 + }, + { + "epoch": 0.24286081811165422, + "grad_norm": 0.15448373556137085, + "learning_rate": 1.9518900343642612e-05, + "loss": 0.3205, + "step": 118 + }, + { + "epoch": 0.24491896063802418, + "grad_norm": 0.14716887474060059, + "learning_rate": 1.9495990836197025e-05, + "loss": 0.3164, + "step": 119 + }, + { + "epoch": 0.24697710316439414, + "grad_norm": 0.16582027077674866, + "learning_rate": 1.9473081328751433e-05, + "loss": 0.3191, + "step": 120 + }, + { + "epoch": 0.24903524569076407, + "grad_norm": 0.15213699638843536, + "learning_rate": 1.9450171821305842e-05, + "loss": 0.304, + "step": 121 + }, + { + "epoch": 0.25109338821713406, + "grad_norm": 0.1659238487482071, + "learning_rate": 1.9427262313860255e-05, + "loss": 0.3184, + "step": 122 + }, + { + "epoch": 0.253151530743504, + "grad_norm": 0.15596656501293182, + "learning_rate": 1.9404352806414663e-05, + "loss": 0.3092, + "step": 123 + }, + { + "epoch": 0.2552096732698739, + "grad_norm": 0.15868476033210754, + "learning_rate": 1.9381443298969072e-05, + "loss": 0.3163, + "step": 124 + }, + { + "epoch": 0.2572678157962439, + "grad_norm": 0.15386095643043518, + "learning_rate": 1.9358533791523485e-05, + "loss": 0.3049, + "step": 125 + }, + { + "epoch": 0.25932595832261385, + "grad_norm": 0.15179213881492615, + "learning_rate": 1.9335624284077894e-05, + "loss": 0.3131, + "step": 126 + }, + { + "epoch": 0.2613841008489838, + "grad_norm": 0.1595134735107422, + "learning_rate": 1.9312714776632306e-05, + "loss": 0.3069, + "step": 127 + }, + { + "epoch": 0.26344224337535377, + "grad_norm": 0.16989803314208984, + "learning_rate": 1.9289805269186715e-05, + "loss": 0.3052, + "step": 128 + }, + { + "epoch": 0.2655003859017237, + "grad_norm": 0.14803892374038696, + "learning_rate": 1.9266895761741124e-05, + "loss": 0.3065, + "step": 129 + }, + { + "epoch": 0.26755852842809363, + "grad_norm": 0.16676583886146545, + "learning_rate": 1.9243986254295536e-05, + "loss": 0.2962, + "step": 130 + }, + { + "epoch": 0.2696166709544636, + "grad_norm": 0.15694552659988403, + "learning_rate": 1.9221076746849945e-05, + "loss": 0.3096, + "step": 131 + }, + { + "epoch": 0.27167481348083355, + "grad_norm": 0.17696696519851685, + "learning_rate": 1.9198167239404354e-05, + "loss": 0.3145, + "step": 132 + }, + { + "epoch": 0.2737329560072035, + "grad_norm": 0.17204038798809052, + "learning_rate": 1.9175257731958766e-05, + "loss": 0.3248, + "step": 133 + }, + { + "epoch": 0.2757910985335735, + "grad_norm": 0.15630359947681427, + "learning_rate": 1.9152348224513175e-05, + "loss": 0.3117, + "step": 134 + }, + { + "epoch": 0.2778492410599434, + "grad_norm": 0.15757997334003448, + "learning_rate": 1.9129438717067584e-05, + "loss": 0.3145, + "step": 135 + }, + { + "epoch": 0.27990738358631334, + "grad_norm": 0.16273653507232666, + "learning_rate": 1.9106529209621996e-05, + "loss": 0.3159, + "step": 136 + }, + { + "epoch": 0.28196552611268333, + "grad_norm": 0.16213104128837585, + "learning_rate": 1.9083619702176405e-05, + "loss": 0.2949, + "step": 137 + }, + { + "epoch": 0.28402366863905326, + "grad_norm": 0.15377865731716156, + "learning_rate": 1.9060710194730814e-05, + "loss": 0.306, + "step": 138 + }, + { + "epoch": 0.2860818111654232, + "grad_norm": 0.1545962244272232, + "learning_rate": 1.9037800687285223e-05, + "loss": 0.2966, + "step": 139 + }, + { + "epoch": 0.2881399536917932, + "grad_norm": 0.15516617894172668, + "learning_rate": 1.9014891179839635e-05, + "loss": 0.3122, + "step": 140 + }, + { + "epoch": 0.2901980962181631, + "grad_norm": 0.14734458923339844, + "learning_rate": 1.8991981672394044e-05, + "loss": 0.3118, + "step": 141 + }, + { + "epoch": 0.29225623874453305, + "grad_norm": 0.1644304096698761, + "learning_rate": 1.8969072164948453e-05, + "loss": 0.3027, + "step": 142 + }, + { + "epoch": 0.29431438127090304, + "grad_norm": 0.14632569253444672, + "learning_rate": 1.8946162657502865e-05, + "loss": 0.3023, + "step": 143 + }, + { + "epoch": 0.29637252379727297, + "grad_norm": 0.1573137789964676, + "learning_rate": 1.8923253150057274e-05, + "loss": 0.3102, + "step": 144 + }, + { + "epoch": 0.2984306663236429, + "grad_norm": 0.16423144936561584, + "learning_rate": 1.8900343642611686e-05, + "loss": 0.3033, + "step": 145 + }, + { + "epoch": 0.3004888088500129, + "grad_norm": 0.15420907735824585, + "learning_rate": 1.8877434135166095e-05, + "loss": 0.3089, + "step": 146 + }, + { + "epoch": 0.3025469513763828, + "grad_norm": 0.1579178273677826, + "learning_rate": 1.8854524627720504e-05, + "loss": 0.3071, + "step": 147 + }, + { + "epoch": 0.30460509390275275, + "grad_norm": 0.15866397321224213, + "learning_rate": 1.8831615120274916e-05, + "loss": 0.3083, + "step": 148 + }, + { + "epoch": 0.30666323642912274, + "grad_norm": 0.16651487350463867, + "learning_rate": 1.8808705612829325e-05, + "loss": 0.3099, + "step": 149 + }, + { + "epoch": 0.3087213789554927, + "grad_norm": 0.16281908750534058, + "learning_rate": 1.8785796105383734e-05, + "loss": 0.3034, + "step": 150 + }, + { + "epoch": 0.3107795214818626, + "grad_norm": 0.17449837923049927, + "learning_rate": 1.8762886597938147e-05, + "loss": 0.3054, + "step": 151 + }, + { + "epoch": 0.3128376640082326, + "grad_norm": 0.15403546392917633, + "learning_rate": 1.8739977090492555e-05, + "loss": 0.297, + "step": 152 + }, + { + "epoch": 0.31489580653460253, + "grad_norm": 0.1472466140985489, + "learning_rate": 1.8717067583046968e-05, + "loss": 0.2973, + "step": 153 + }, + { + "epoch": 0.31695394906097246, + "grad_norm": 0.16027937829494476, + "learning_rate": 1.8694158075601377e-05, + "loss": 0.3054, + "step": 154 + }, + { + "epoch": 0.31901209158734245, + "grad_norm": 0.17086225748062134, + "learning_rate": 1.8671248568155786e-05, + "loss": 0.307, + "step": 155 + }, + { + "epoch": 0.3210702341137124, + "grad_norm": 0.15930697321891785, + "learning_rate": 1.8648339060710198e-05, + "loss": 0.293, + "step": 156 + }, + { + "epoch": 0.3231283766400823, + "grad_norm": 0.17086376249790192, + "learning_rate": 1.8625429553264607e-05, + "loss": 0.293, + "step": 157 + }, + { + "epoch": 0.3251865191664523, + "grad_norm": 0.15970875322818756, + "learning_rate": 1.8602520045819016e-05, + "loss": 0.3083, + "step": 158 + }, + { + "epoch": 0.32724466169282224, + "grad_norm": 0.16355909407138824, + "learning_rate": 1.8579610538373428e-05, + "loss": 0.3139, + "step": 159 + }, + { + "epoch": 0.32930280421919217, + "grad_norm": 0.15183711051940918, + "learning_rate": 1.8556701030927837e-05, + "loss": 0.2953, + "step": 160 + }, + { + "epoch": 0.33136094674556216, + "grad_norm": 0.15123715996742249, + "learning_rate": 1.853379152348225e-05, + "loss": 0.3025, + "step": 161 + }, + { + "epoch": 0.3334190892719321, + "grad_norm": 0.1576143503189087, + "learning_rate": 1.8510882016036658e-05, + "loss": 0.2904, + "step": 162 + }, + { + "epoch": 0.335477231798302, + "grad_norm": 0.1457504779100418, + "learning_rate": 1.8487972508591067e-05, + "loss": 0.2909, + "step": 163 + }, + { + "epoch": 0.337535374324672, + "grad_norm": 0.1557442992925644, + "learning_rate": 1.846506300114548e-05, + "loss": 0.3027, + "step": 164 + }, + { + "epoch": 0.33959351685104194, + "grad_norm": 0.15662318468093872, + "learning_rate": 1.8442153493699888e-05, + "loss": 0.311, + "step": 165 + }, + { + "epoch": 0.3416516593774119, + "grad_norm": 0.16177058219909668, + "learning_rate": 1.8419243986254297e-05, + "loss": 0.2944, + "step": 166 + }, + { + "epoch": 0.34370980190378186, + "grad_norm": 0.16406729817390442, + "learning_rate": 1.8396334478808706e-05, + "loss": 0.2927, + "step": 167 + }, + { + "epoch": 0.3457679444301518, + "grad_norm": 0.16642791032791138, + "learning_rate": 1.8373424971363115e-05, + "loss": 0.3063, + "step": 168 + }, + { + "epoch": 0.34782608695652173, + "grad_norm": 0.1650693714618683, + "learning_rate": 1.8350515463917527e-05, + "loss": 0.2957, + "step": 169 + }, + { + "epoch": 0.3498842294828917, + "grad_norm": 0.15349675714969635, + "learning_rate": 1.8327605956471936e-05, + "loss": 0.297, + "step": 170 + }, + { + "epoch": 0.35194237200926165, + "grad_norm": 0.17770209908485413, + "learning_rate": 1.8304696449026348e-05, + "loss": 0.3011, + "step": 171 + }, + { + "epoch": 0.3540005145356316, + "grad_norm": 0.1647631675004959, + "learning_rate": 1.8281786941580757e-05, + "loss": 0.2962, + "step": 172 + }, + { + "epoch": 0.35605865706200157, + "grad_norm": 0.1603834480047226, + "learning_rate": 1.8258877434135166e-05, + "loss": 0.2937, + "step": 173 + }, + { + "epoch": 0.3581167995883715, + "grad_norm": 0.16780880093574524, + "learning_rate": 1.8235967926689578e-05, + "loss": 0.2997, + "step": 174 + }, + { + "epoch": 0.36017494211474144, + "grad_norm": 0.15976767241954803, + "learning_rate": 1.8213058419243987e-05, + "loss": 0.3043, + "step": 175 + }, + { + "epoch": 0.3622330846411114, + "grad_norm": 0.16236485540866852, + "learning_rate": 1.8190148911798396e-05, + "loss": 0.3069, + "step": 176 + }, + { + "epoch": 0.36429122716748136, + "grad_norm": 0.16391968727111816, + "learning_rate": 1.816723940435281e-05, + "loss": 0.2923, + "step": 177 + }, + { + "epoch": 0.3663493696938513, + "grad_norm": 0.15806889533996582, + "learning_rate": 1.8144329896907217e-05, + "loss": 0.2872, + "step": 178 + }, + { + "epoch": 0.3684075122202212, + "grad_norm": 0.1627352088689804, + "learning_rate": 1.812142038946163e-05, + "loss": 0.3032, + "step": 179 + }, + { + "epoch": 0.3704656547465912, + "grad_norm": 0.15103371441364288, + "learning_rate": 1.809851088201604e-05, + "loss": 0.2847, + "step": 180 + }, + { + "epoch": 0.37252379727296114, + "grad_norm": 0.15178488194942474, + "learning_rate": 1.8075601374570447e-05, + "loss": 0.3017, + "step": 181 + }, + { + "epoch": 0.3745819397993311, + "grad_norm": 0.15493899583816528, + "learning_rate": 1.805269186712486e-05, + "loss": 0.2901, + "step": 182 + }, + { + "epoch": 0.37664008232570106, + "grad_norm": 0.15990686416625977, + "learning_rate": 1.802978235967927e-05, + "loss": 0.2861, + "step": 183 + }, + { + "epoch": 0.378698224852071, + "grad_norm": 0.15824148058891296, + "learning_rate": 1.8006872852233677e-05, + "loss": 0.2885, + "step": 184 + }, + { + "epoch": 0.38075636737844093, + "grad_norm": 0.15690775215625763, + "learning_rate": 1.798396334478809e-05, + "loss": 0.2814, + "step": 185 + }, + { + "epoch": 0.3828145099048109, + "grad_norm": 0.15833796560764313, + "learning_rate": 1.79610538373425e-05, + "loss": 0.2847, + "step": 186 + }, + { + "epoch": 0.38487265243118085, + "grad_norm": 0.16560044884681702, + "learning_rate": 1.793814432989691e-05, + "loss": 0.3061, + "step": 187 + }, + { + "epoch": 0.3869307949575508, + "grad_norm": 0.16240179538726807, + "learning_rate": 1.791523482245132e-05, + "loss": 0.2943, + "step": 188 + }, + { + "epoch": 0.38898893748392077, + "grad_norm": 0.15825721621513367, + "learning_rate": 1.789232531500573e-05, + "loss": 0.2934, + "step": 189 + }, + { + "epoch": 0.3910470800102907, + "grad_norm": 0.16665388643741608, + "learning_rate": 1.786941580756014e-05, + "loss": 0.291, + "step": 190 + }, + { + "epoch": 0.39310522253666064, + "grad_norm": 0.16581200063228607, + "learning_rate": 1.784650630011455e-05, + "loss": 0.2849, + "step": 191 + }, + { + "epoch": 0.3951633650630306, + "grad_norm": 0.1604345291852951, + "learning_rate": 1.782359679266896e-05, + "loss": 0.3, + "step": 192 + }, + { + "epoch": 0.39722150758940056, + "grad_norm": 0.16107915341854095, + "learning_rate": 1.7800687285223368e-05, + "loss": 0.2847, + "step": 193 + }, + { + "epoch": 0.3992796501157705, + "grad_norm": 0.1571730375289917, + "learning_rate": 1.7777777777777777e-05, + "loss": 0.2863, + "step": 194 + }, + { + "epoch": 0.4013377926421405, + "grad_norm": 0.1656399518251419, + "learning_rate": 1.775486827033219e-05, + "loss": 0.2878, + "step": 195 + }, + { + "epoch": 0.4033959351685104, + "grad_norm": 0.16738460958003998, + "learning_rate": 1.7731958762886598e-05, + "loss": 0.286, + "step": 196 + }, + { + "epoch": 0.40545407769488034, + "grad_norm": 0.16704292595386505, + "learning_rate": 1.770904925544101e-05, + "loss": 0.2919, + "step": 197 + }, + { + "epoch": 0.40751222022125033, + "grad_norm": 0.16215579211711884, + "learning_rate": 1.768613974799542e-05, + "loss": 0.2874, + "step": 198 + }, + { + "epoch": 0.40957036274762026, + "grad_norm": 0.15573479235172272, + "learning_rate": 1.7663230240549828e-05, + "loss": 0.2904, + "step": 199 + }, + { + "epoch": 0.4116285052739902, + "grad_norm": 0.1707623153924942, + "learning_rate": 1.764032073310424e-05, + "loss": 0.289, + "step": 200 + }, + { + "epoch": 0.4116285052739902, + "eval_loss": 0.3214050829410553, + "eval_runtime": 2449.7742, + "eval_samples_per_second": 3.173, + "eval_steps_per_second": 0.794, + "step": 200 + }, + { + "epoch": 0.4136866478003602, + "grad_norm": 0.1699172556400299, + "learning_rate": 1.761741122565865e-05, + "loss": 0.2852, + "step": 201 + }, + { + "epoch": 0.4157447903267301, + "grad_norm": 0.19150058925151825, + "learning_rate": 1.7594501718213058e-05, + "loss": 0.29, + "step": 202 + }, + { + "epoch": 0.41780293285310005, + "grad_norm": 0.15794627368450165, + "learning_rate": 1.757159221076747e-05, + "loss": 0.2746, + "step": 203 + }, + { + "epoch": 0.41986107537947004, + "grad_norm": 0.17305190861225128, + "learning_rate": 1.754868270332188e-05, + "loss": 0.3003, + "step": 204 + }, + { + "epoch": 0.42191921790583997, + "grad_norm": 0.16257523000240326, + "learning_rate": 1.752577319587629e-05, + "loss": 0.2789, + "step": 205 + }, + { + "epoch": 0.4239773604322099, + "grad_norm": 0.17273619771003723, + "learning_rate": 1.75028636884307e-05, + "loss": 0.2917, + "step": 206 + }, + { + "epoch": 0.4260355029585799, + "grad_norm": 0.17502790689468384, + "learning_rate": 1.747995418098511e-05, + "loss": 0.2992, + "step": 207 + }, + { + "epoch": 0.4280936454849498, + "grad_norm": 0.16464050114154816, + "learning_rate": 1.745704467353952e-05, + "loss": 0.2873, + "step": 208 + }, + { + "epoch": 0.43015178801131976, + "grad_norm": 0.1681668758392334, + "learning_rate": 1.743413516609393e-05, + "loss": 0.2991, + "step": 209 + }, + { + "epoch": 0.43220993053768975, + "grad_norm": 0.16957956552505493, + "learning_rate": 1.741122565864834e-05, + "loss": 0.2868, + "step": 210 + }, + { + "epoch": 0.4342680730640597, + "grad_norm": 0.15875883400440216, + "learning_rate": 1.738831615120275e-05, + "loss": 0.2946, + "step": 211 + }, + { + "epoch": 0.4363262155904296, + "grad_norm": 0.18127889931201935, + "learning_rate": 1.736540664375716e-05, + "loss": 0.2835, + "step": 212 + }, + { + "epoch": 0.4383843581167996, + "grad_norm": 0.17822811007499695, + "learning_rate": 1.7342497136311573e-05, + "loss": 0.2944, + "step": 213 + }, + { + "epoch": 0.44044250064316953, + "grad_norm": 0.17555806040763855, + "learning_rate": 1.731958762886598e-05, + "loss": 0.3001, + "step": 214 + }, + { + "epoch": 0.44250064316953946, + "grad_norm": 0.18709121644496918, + "learning_rate": 1.729667812142039e-05, + "loss": 0.282, + "step": 215 + }, + { + "epoch": 0.44455878569590945, + "grad_norm": 0.16322475671768188, + "learning_rate": 1.7273768613974803e-05, + "loss": 0.2883, + "step": 216 + }, + { + "epoch": 0.4466169282222794, + "grad_norm": 0.1677054911851883, + "learning_rate": 1.7250859106529212e-05, + "loss": 0.28, + "step": 217 + }, + { + "epoch": 0.4486750707486493, + "grad_norm": 0.15764063596725464, + "learning_rate": 1.722794959908362e-05, + "loss": 0.2768, + "step": 218 + }, + { + "epoch": 0.4507332132750193, + "grad_norm": 0.16166841983795166, + "learning_rate": 1.7205040091638033e-05, + "loss": 0.2868, + "step": 219 + }, + { + "epoch": 0.45279135580138924, + "grad_norm": 0.1799350380897522, + "learning_rate": 1.7182130584192442e-05, + "loss": 0.2891, + "step": 220 + }, + { + "epoch": 0.45484949832775917, + "grad_norm": 0.18119174242019653, + "learning_rate": 1.715922107674685e-05, + "loss": 0.2841, + "step": 221 + }, + { + "epoch": 0.45690764085412916, + "grad_norm": 0.17725548148155212, + "learning_rate": 1.713631156930126e-05, + "loss": 0.3038, + "step": 222 + }, + { + "epoch": 0.4589657833804991, + "grad_norm": 0.1628233790397644, + "learning_rate": 1.7113402061855672e-05, + "loss": 0.2868, + "step": 223 + }, + { + "epoch": 0.461023925906869, + "grad_norm": 0.1745166927576065, + "learning_rate": 1.709049255441008e-05, + "loss": 0.3033, + "step": 224 + }, + { + "epoch": 0.463082068433239, + "grad_norm": 0.17708267271518707, + "learning_rate": 1.706758304696449e-05, + "loss": 0.2842, + "step": 225 + }, + { + "epoch": 0.46514021095960895, + "grad_norm": 0.1738453358411789, + "learning_rate": 1.7044673539518902e-05, + "loss": 0.3005, + "step": 226 + }, + { + "epoch": 0.4671983534859789, + "grad_norm": 0.1706874966621399, + "learning_rate": 1.702176403207331e-05, + "loss": 0.2924, + "step": 227 + }, + { + "epoch": 0.46925649601234887, + "grad_norm": 0.1697423756122589, + "learning_rate": 1.699885452462772e-05, + "loss": 0.2783, + "step": 228 + }, + { + "epoch": 0.4713146385387188, + "grad_norm": 0.1783403754234314, + "learning_rate": 1.6975945017182132e-05, + "loss": 0.2924, + "step": 229 + }, + { + "epoch": 0.47337278106508873, + "grad_norm": 0.17431536316871643, + "learning_rate": 1.695303550973654e-05, + "loss": 0.2792, + "step": 230 + }, + { + "epoch": 0.4754309235914587, + "grad_norm": 0.164026141166687, + "learning_rate": 1.6930126002290953e-05, + "loss": 0.2825, + "step": 231 + }, + { + "epoch": 0.47748906611782865, + "grad_norm": 0.16449657082557678, + "learning_rate": 1.6907216494845362e-05, + "loss": 0.2831, + "step": 232 + }, + { + "epoch": 0.4795472086441986, + "grad_norm": 0.1812741607427597, + "learning_rate": 1.688430698739977e-05, + "loss": 0.2849, + "step": 233 + }, + { + "epoch": 0.4816053511705686, + "grad_norm": 0.18431834876537323, + "learning_rate": 1.6861397479954183e-05, + "loss": 0.2802, + "step": 234 + }, + { + "epoch": 0.4836634936969385, + "grad_norm": 0.18349015712738037, + "learning_rate": 1.6838487972508592e-05, + "loss": 0.2804, + "step": 235 + }, + { + "epoch": 0.48572163622330844, + "grad_norm": 0.1769968420267105, + "learning_rate": 1.6815578465063e-05, + "loss": 0.2777, + "step": 236 + }, + { + "epoch": 0.4877797787496784, + "grad_norm": 0.17207500338554382, + "learning_rate": 1.6792668957617413e-05, + "loss": 0.2883, + "step": 237 + }, + { + "epoch": 0.48983792127604836, + "grad_norm": 0.1729692667722702, + "learning_rate": 1.6769759450171822e-05, + "loss": 0.2784, + "step": 238 + }, + { + "epoch": 0.4918960638024183, + "grad_norm": 0.17234881222248077, + "learning_rate": 1.6746849942726235e-05, + "loss": 0.2816, + "step": 239 + }, + { + "epoch": 0.4939542063287883, + "grad_norm": 0.17132551968097687, + "learning_rate": 1.6723940435280644e-05, + "loss": 0.2812, + "step": 240 + }, + { + "epoch": 0.4960123488551582, + "grad_norm": 0.1752254068851471, + "learning_rate": 1.6701030927835052e-05, + "loss": 0.2799, + "step": 241 + }, + { + "epoch": 0.49807049138152815, + "grad_norm": 0.1768665313720703, + "learning_rate": 1.6678121420389465e-05, + "loss": 0.2966, + "step": 242 + }, + { + "epoch": 0.5001286339078981, + "grad_norm": 0.18139514327049255, + "learning_rate": 1.6655211912943874e-05, + "loss": 0.2816, + "step": 243 + }, + { + "epoch": 0.5021867764342681, + "grad_norm": 0.17312943935394287, + "learning_rate": 1.6632302405498283e-05, + "loss": 0.2845, + "step": 244 + }, + { + "epoch": 0.5042449189606381, + "grad_norm": 0.17966389656066895, + "learning_rate": 1.6609392898052695e-05, + "loss": 0.2864, + "step": 245 + }, + { + "epoch": 0.506303061487008, + "grad_norm": 0.16653811931610107, + "learning_rate": 1.6586483390607104e-05, + "loss": 0.2759, + "step": 246 + }, + { + "epoch": 0.5083612040133779, + "grad_norm": 0.1634613424539566, + "learning_rate": 1.6563573883161516e-05, + "loss": 0.2728, + "step": 247 + }, + { + "epoch": 0.5104193465397479, + "grad_norm": 0.17358507215976715, + "learning_rate": 1.654066437571592e-05, + "loss": 0.2706, + "step": 248 + }, + { + "epoch": 0.5124774890661178, + "grad_norm": 0.17524316906929016, + "learning_rate": 1.6517754868270334e-05, + "loss": 0.2805, + "step": 249 + }, + { + "epoch": 0.5145356315924878, + "grad_norm": 0.18134094774723053, + "learning_rate": 1.6494845360824743e-05, + "loss": 0.2909, + "step": 250 + }, + { + "epoch": 0.5165937741188578, + "grad_norm": 0.17795510590076447, + "learning_rate": 1.647193585337915e-05, + "loss": 0.2889, + "step": 251 + }, + { + "epoch": 0.5186519166452277, + "grad_norm": 0.16782547533512115, + "learning_rate": 1.6449026345933564e-05, + "loss": 0.2842, + "step": 252 + }, + { + "epoch": 0.5207100591715976, + "grad_norm": 0.17360062897205353, + "learning_rate": 1.6426116838487973e-05, + "loss": 0.2763, + "step": 253 + }, + { + "epoch": 0.5227682016979676, + "grad_norm": 0.17241406440734863, + "learning_rate": 1.6403207331042385e-05, + "loss": 0.2753, + "step": 254 + }, + { + "epoch": 0.5248263442243375, + "grad_norm": 0.1709229201078415, + "learning_rate": 1.6380297823596794e-05, + "loss": 0.2732, + "step": 255 + }, + { + "epoch": 0.5268844867507075, + "grad_norm": 0.1807374209165573, + "learning_rate": 1.6357388316151203e-05, + "loss": 0.2856, + "step": 256 + }, + { + "epoch": 0.5289426292770775, + "grad_norm": 0.1749904304742813, + "learning_rate": 1.6334478808705615e-05, + "loss": 0.285, + "step": 257 + }, + { + "epoch": 0.5310007718034474, + "grad_norm": 0.16673170030117035, + "learning_rate": 1.6311569301260024e-05, + "loss": 0.2825, + "step": 258 + }, + { + "epoch": 0.5330589143298173, + "grad_norm": 0.17239685356616974, + "learning_rate": 1.6288659793814433e-05, + "loss": 0.2845, + "step": 259 + }, + { + "epoch": 0.5351170568561873, + "grad_norm": 0.1831504851579666, + "learning_rate": 1.6265750286368845e-05, + "loss": 0.2859, + "step": 260 + }, + { + "epoch": 0.5371751993825572, + "grad_norm": 0.18507827818393707, + "learning_rate": 1.6242840778923254e-05, + "loss": 0.293, + "step": 261 + }, + { + "epoch": 0.5392333419089272, + "grad_norm": 0.16738134622573853, + "learning_rate": 1.6219931271477663e-05, + "loss": 0.2853, + "step": 262 + }, + { + "epoch": 0.5412914844352972, + "grad_norm": 0.1701226830482483, + "learning_rate": 1.6197021764032075e-05, + "loss": 0.2763, + "step": 263 + }, + { + "epoch": 0.5433496269616671, + "grad_norm": 0.18195705115795135, + "learning_rate": 1.6174112256586484e-05, + "loss": 0.2797, + "step": 264 + }, + { + "epoch": 0.545407769488037, + "grad_norm": 0.1832309514284134, + "learning_rate": 1.6151202749140896e-05, + "loss": 0.2885, + "step": 265 + }, + { + "epoch": 0.547465912014407, + "grad_norm": 0.1773810088634491, + "learning_rate": 1.6128293241695305e-05, + "loss": 0.2682, + "step": 266 + }, + { + "epoch": 0.5495240545407769, + "grad_norm": 0.16989603638648987, + "learning_rate": 1.6105383734249714e-05, + "loss": 0.2821, + "step": 267 + }, + { + "epoch": 0.551582197067147, + "grad_norm": 0.17835170030593872, + "learning_rate": 1.6082474226804127e-05, + "loss": 0.2774, + "step": 268 + }, + { + "epoch": 0.5536403395935169, + "grad_norm": 0.1777082234621048, + "learning_rate": 1.6059564719358535e-05, + "loss": 0.2726, + "step": 269 + }, + { + "epoch": 0.5556984821198868, + "grad_norm": 0.18766450881958008, + "learning_rate": 1.6036655211912944e-05, + "loss": 0.2879, + "step": 270 + }, + { + "epoch": 0.5577566246462567, + "grad_norm": 0.1868186593055725, + "learning_rate": 1.6013745704467357e-05, + "loss": 0.2808, + "step": 271 + }, + { + "epoch": 0.5598147671726267, + "grad_norm": 0.16695882380008698, + "learning_rate": 1.5990836197021766e-05, + "loss": 0.2668, + "step": 272 + }, + { + "epoch": 0.5618729096989966, + "grad_norm": 0.17224495112895966, + "learning_rate": 1.5967926689576178e-05, + "loss": 0.2682, + "step": 273 + }, + { + "epoch": 0.5639310522253667, + "grad_norm": 0.20116423070430756, + "learning_rate": 1.5945017182130587e-05, + "loss": 0.276, + "step": 274 + }, + { + "epoch": 0.5659891947517366, + "grad_norm": 0.19478343427181244, + "learning_rate": 1.5922107674684996e-05, + "loss": 0.2854, + "step": 275 + }, + { + "epoch": 0.5680473372781065, + "grad_norm": 0.20242950320243835, + "learning_rate": 1.5899198167239405e-05, + "loss": 0.2854, + "step": 276 + }, + { + "epoch": 0.5701054798044765, + "grad_norm": 0.19146093726158142, + "learning_rate": 1.5876288659793813e-05, + "loss": 0.2817, + "step": 277 + }, + { + "epoch": 0.5721636223308464, + "grad_norm": 0.1804896742105484, + "learning_rate": 1.5853379152348226e-05, + "loss": 0.2714, + "step": 278 + }, + { + "epoch": 0.5742217648572163, + "grad_norm": 0.19315646588802338, + "learning_rate": 1.5830469644902635e-05, + "loss": 0.2703, + "step": 279 + }, + { + "epoch": 0.5762799073835864, + "grad_norm": 0.1910266876220703, + "learning_rate": 1.5807560137457047e-05, + "loss": 0.2728, + "step": 280 + }, + { + "epoch": 0.5783380499099563, + "grad_norm": 0.20330773293972015, + "learning_rate": 1.5784650630011456e-05, + "loss": 0.2717, + "step": 281 + }, + { + "epoch": 0.5803961924363262, + "grad_norm": 0.19080683588981628, + "learning_rate": 1.5761741122565865e-05, + "loss": 0.2679, + "step": 282 + }, + { + "epoch": 0.5824543349626962, + "grad_norm": 0.18052135407924652, + "learning_rate": 1.5738831615120277e-05, + "loss": 0.2815, + "step": 283 + }, + { + "epoch": 0.5845124774890661, + "grad_norm": 0.1998361051082611, + "learning_rate": 1.5715922107674686e-05, + "loss": 0.2888, + "step": 284 + }, + { + "epoch": 0.586570620015436, + "grad_norm": 0.1978764683008194, + "learning_rate": 1.5693012600229095e-05, + "loss": 0.2926, + "step": 285 + }, + { + "epoch": 0.5886287625418061, + "grad_norm": 0.17189203202724457, + "learning_rate": 1.5670103092783507e-05, + "loss": 0.2674, + "step": 286 + }, + { + "epoch": 0.590686905068176, + "grad_norm": 0.1937166303396225, + "learning_rate": 1.5647193585337916e-05, + "loss": 0.2838, + "step": 287 + }, + { + "epoch": 0.5927450475945459, + "grad_norm": 0.18978627026081085, + "learning_rate": 1.5624284077892328e-05, + "loss": 0.273, + "step": 288 + }, + { + "epoch": 0.5948031901209159, + "grad_norm": 0.17718705534934998, + "learning_rate": 1.5601374570446737e-05, + "loss": 0.2842, + "step": 289 + }, + { + "epoch": 0.5968613326472858, + "grad_norm": 0.1912536770105362, + "learning_rate": 1.5578465063001146e-05, + "loss": 0.2736, + "step": 290 + }, + { + "epoch": 0.5989194751736557, + "grad_norm": 0.18104907870292664, + "learning_rate": 1.555555555555556e-05, + "loss": 0.274, + "step": 291 + }, + { + "epoch": 0.6009776177000258, + "grad_norm": 0.1620381772518158, + "learning_rate": 1.5532646048109967e-05, + "loss": 0.2663, + "step": 292 + }, + { + "epoch": 0.6030357602263957, + "grad_norm": 0.17973916232585907, + "learning_rate": 1.5509736540664376e-05, + "loss": 0.2791, + "step": 293 + }, + { + "epoch": 0.6050939027527656, + "grad_norm": 0.16821186244487762, + "learning_rate": 1.548682703321879e-05, + "loss": 0.2787, + "step": 294 + }, + { + "epoch": 0.6071520452791356, + "grad_norm": 0.18426693975925446, + "learning_rate": 1.5463917525773197e-05, + "loss": 0.2886, + "step": 295 + }, + { + "epoch": 0.6092101878055055, + "grad_norm": 0.19796033203601837, + "learning_rate": 1.5441008018327606e-05, + "loss": 0.268, + "step": 296 + }, + { + "epoch": 0.6112683303318754, + "grad_norm": 0.1971343755722046, + "learning_rate": 1.541809851088202e-05, + "loss": 0.2761, + "step": 297 + }, + { + "epoch": 0.6133264728582455, + "grad_norm": 0.17458567023277283, + "learning_rate": 1.5395189003436427e-05, + "loss": 0.2831, + "step": 298 + }, + { + "epoch": 0.6153846153846154, + "grad_norm": 0.17610400915145874, + "learning_rate": 1.537227949599084e-05, + "loss": 0.2691, + "step": 299 + }, + { + "epoch": 0.6174427579109854, + "grad_norm": 0.1929042488336563, + "learning_rate": 1.534936998854525e-05, + "loss": 0.2847, + "step": 300 + }, + { + "epoch": 0.6174427579109854, + "eval_loss": 0.2959522604942322, + "eval_runtime": 2428.6339, + "eval_samples_per_second": 3.201, + "eval_steps_per_second": 0.8, + "step": 300 + }, + { + "epoch": 0.6195009004373553, + "grad_norm": 0.19430233538150787, + "learning_rate": 1.5326460481099657e-05, + "loss": 0.279, + "step": 301 + }, + { + "epoch": 0.6215590429637252, + "grad_norm": 0.18542642891407013, + "learning_rate": 1.5303550973654066e-05, + "loss": 0.2695, + "step": 302 + }, + { + "epoch": 0.6236171854900951, + "grad_norm": 0.1850169450044632, + "learning_rate": 1.5280641466208475e-05, + "loss": 0.2847, + "step": 303 + }, + { + "epoch": 0.6256753280164652, + "grad_norm": 0.18449267745018005, + "learning_rate": 1.5257731958762888e-05, + "loss": 0.2804, + "step": 304 + }, + { + "epoch": 0.6277334705428351, + "grad_norm": 0.18608458340168, + "learning_rate": 1.5234822451317296e-05, + "loss": 0.2792, + "step": 305 + }, + { + "epoch": 0.6297916130692051, + "grad_norm": 0.21136076748371124, + "learning_rate": 1.5211912943871707e-05, + "loss": 0.2829, + "step": 306 + }, + { + "epoch": 0.631849755595575, + "grad_norm": 0.19672206044197083, + "learning_rate": 1.5189003436426118e-05, + "loss": 0.2854, + "step": 307 + }, + { + "epoch": 0.6339078981219449, + "grad_norm": 0.1834034025669098, + "learning_rate": 1.5166093928980528e-05, + "loss": 0.2775, + "step": 308 + }, + { + "epoch": 0.6359660406483149, + "grad_norm": 0.18414819240570068, + "learning_rate": 1.5143184421534937e-05, + "loss": 0.2794, + "step": 309 + }, + { + "epoch": 0.6380241831746849, + "grad_norm": 0.1890152245759964, + "learning_rate": 1.5120274914089348e-05, + "loss": 0.2718, + "step": 310 + }, + { + "epoch": 0.6400823257010548, + "grad_norm": 0.18923887610435486, + "learning_rate": 1.5097365406643758e-05, + "loss": 0.2795, + "step": 311 + }, + { + "epoch": 0.6421404682274248, + "grad_norm": 0.20047079026699066, + "learning_rate": 1.5074455899198169e-05, + "loss": 0.2811, + "step": 312 + }, + { + "epoch": 0.6441986107537947, + "grad_norm": 0.1910201609134674, + "learning_rate": 1.5051546391752578e-05, + "loss": 0.2732, + "step": 313 + }, + { + "epoch": 0.6462567532801646, + "grad_norm": 0.2021956443786621, + "learning_rate": 1.5028636884306988e-05, + "loss": 0.2806, + "step": 314 + }, + { + "epoch": 0.6483148958065346, + "grad_norm": 0.18957914412021637, + "learning_rate": 1.5005727376861399e-05, + "loss": 0.2681, + "step": 315 + }, + { + "epoch": 0.6503730383329046, + "grad_norm": 0.19858811795711517, + "learning_rate": 1.498281786941581e-05, + "loss": 0.2805, + "step": 316 + }, + { + "epoch": 0.6524311808592745, + "grad_norm": 0.1731935292482376, + "learning_rate": 1.4959908361970218e-05, + "loss": 0.2646, + "step": 317 + }, + { + "epoch": 0.6544893233856445, + "grad_norm": 0.19619058072566986, + "learning_rate": 1.4936998854524629e-05, + "loss": 0.2965, + "step": 318 + }, + { + "epoch": 0.6565474659120144, + "grad_norm": 0.18745696544647217, + "learning_rate": 1.491408934707904e-05, + "loss": 0.2766, + "step": 319 + }, + { + "epoch": 0.6586056084383843, + "grad_norm": 0.18006449937820435, + "learning_rate": 1.489117983963345e-05, + "loss": 0.2788, + "step": 320 + }, + { + "epoch": 0.6606637509647543, + "grad_norm": 0.17593689262866974, + "learning_rate": 1.486827033218786e-05, + "loss": 0.2813, + "step": 321 + }, + { + "epoch": 0.6627218934911243, + "grad_norm": 0.18695640563964844, + "learning_rate": 1.484536082474227e-05, + "loss": 0.281, + "step": 322 + }, + { + "epoch": 0.6647800360174942, + "grad_norm": 0.17909488081932068, + "learning_rate": 1.482245131729668e-05, + "loss": 0.2814, + "step": 323 + }, + { + "epoch": 0.6668381785438642, + "grad_norm": 0.19074076414108276, + "learning_rate": 1.4799541809851091e-05, + "loss": 0.2721, + "step": 324 + }, + { + "epoch": 0.6688963210702341, + "grad_norm": 0.19175754487514496, + "learning_rate": 1.47766323024055e-05, + "loss": 0.2754, + "step": 325 + }, + { + "epoch": 0.670954463596604, + "grad_norm": 0.18646575510501862, + "learning_rate": 1.475372279495991e-05, + "loss": 0.2678, + "step": 326 + }, + { + "epoch": 0.673012606122974, + "grad_norm": 0.18553243577480316, + "learning_rate": 1.4730813287514321e-05, + "loss": 0.281, + "step": 327 + }, + { + "epoch": 0.675070748649344, + "grad_norm": 0.17120976746082306, + "learning_rate": 1.470790378006873e-05, + "loss": 0.2691, + "step": 328 + }, + { + "epoch": 0.677128891175714, + "grad_norm": 0.19170524179935455, + "learning_rate": 1.4684994272623139e-05, + "loss": 0.2685, + "step": 329 + }, + { + "epoch": 0.6791870337020839, + "grad_norm": 0.1851339191198349, + "learning_rate": 1.466208476517755e-05, + "loss": 0.266, + "step": 330 + }, + { + "epoch": 0.6812451762284538, + "grad_norm": 0.1678062081336975, + "learning_rate": 1.4639175257731958e-05, + "loss": 0.2609, + "step": 331 + }, + { + "epoch": 0.6833033187548238, + "grad_norm": 0.17913252115249634, + "learning_rate": 1.4616265750286369e-05, + "loss": 0.2716, + "step": 332 + }, + { + "epoch": 0.6853614612811937, + "grad_norm": 0.1859239637851715, + "learning_rate": 1.459335624284078e-05, + "loss": 0.2712, + "step": 333 + }, + { + "epoch": 0.6874196038075637, + "grad_norm": 0.18390226364135742, + "learning_rate": 1.457044673539519e-05, + "loss": 0.2827, + "step": 334 + }, + { + "epoch": 0.6894777463339337, + "grad_norm": 0.18520398437976837, + "learning_rate": 1.4547537227949599e-05, + "loss": 0.2721, + "step": 335 + }, + { + "epoch": 0.6915358888603036, + "grad_norm": 0.18416717648506165, + "learning_rate": 1.452462772050401e-05, + "loss": 0.2683, + "step": 336 + }, + { + "epoch": 0.6935940313866735, + "grad_norm": 0.18727894127368927, + "learning_rate": 1.450171821305842e-05, + "loss": 0.2733, + "step": 337 + }, + { + "epoch": 0.6956521739130435, + "grad_norm": 0.18597093224525452, + "learning_rate": 1.447880870561283e-05, + "loss": 0.2708, + "step": 338 + }, + { + "epoch": 0.6977103164394134, + "grad_norm": 0.1786068081855774, + "learning_rate": 1.445589919816724e-05, + "loss": 0.2667, + "step": 339 + }, + { + "epoch": 0.6997684589657834, + "grad_norm": 0.17466600239276886, + "learning_rate": 1.443298969072165e-05, + "loss": 0.2786, + "step": 340 + }, + { + "epoch": 0.7018266014921534, + "grad_norm": 0.185857355594635, + "learning_rate": 1.4410080183276061e-05, + "loss": 0.2759, + "step": 341 + }, + { + "epoch": 0.7038847440185233, + "grad_norm": 0.2004527747631073, + "learning_rate": 1.4387170675830471e-05, + "loss": 0.2847, + "step": 342 + }, + { + "epoch": 0.7059428865448932, + "grad_norm": 0.18774060904979706, + "learning_rate": 1.436426116838488e-05, + "loss": 0.2766, + "step": 343 + }, + { + "epoch": 0.7080010290712632, + "grad_norm": 0.1840328425168991, + "learning_rate": 1.4341351660939291e-05, + "loss": 0.2722, + "step": 344 + }, + { + "epoch": 0.7100591715976331, + "grad_norm": 0.19089624285697937, + "learning_rate": 1.4318442153493702e-05, + "loss": 0.2779, + "step": 345 + }, + { + "epoch": 0.7121173141240031, + "grad_norm": 0.1848018616437912, + "learning_rate": 1.4295532646048112e-05, + "loss": 0.2739, + "step": 346 + }, + { + "epoch": 0.7141754566503731, + "grad_norm": 0.18844038248062134, + "learning_rate": 1.4272623138602521e-05, + "loss": 0.27, + "step": 347 + }, + { + "epoch": 0.716233599176743, + "grad_norm": 0.19289302825927734, + "learning_rate": 1.4249713631156932e-05, + "loss": 0.2743, + "step": 348 + }, + { + "epoch": 0.7182917417031129, + "grad_norm": 0.18738920986652374, + "learning_rate": 1.4226804123711342e-05, + "loss": 0.2657, + "step": 349 + }, + { + "epoch": 0.7203498842294829, + "grad_norm": 0.1925181746482849, + "learning_rate": 1.4203894616265753e-05, + "loss": 0.2637, + "step": 350 + }, + { + "epoch": 0.7224080267558528, + "grad_norm": 0.19114750623703003, + "learning_rate": 1.4180985108820162e-05, + "loss": 0.2758, + "step": 351 + }, + { + "epoch": 0.7244661692822228, + "grad_norm": 0.18310120701789856, + "learning_rate": 1.4158075601374572e-05, + "loss": 0.2777, + "step": 352 + }, + { + "epoch": 0.7265243118085928, + "grad_norm": 0.2045605331659317, + "learning_rate": 1.4135166093928983e-05, + "loss": 0.2653, + "step": 353 + }, + { + "epoch": 0.7285824543349627, + "grad_norm": 0.1856454759836197, + "learning_rate": 1.4112256586483393e-05, + "loss": 0.267, + "step": 354 + }, + { + "epoch": 0.7306405968613326, + "grad_norm": 0.1855366826057434, + "learning_rate": 1.4089347079037802e-05, + "loss": 0.2805, + "step": 355 + }, + { + "epoch": 0.7326987393877026, + "grad_norm": 0.17913414537906647, + "learning_rate": 1.4066437571592213e-05, + "loss": 0.2755, + "step": 356 + }, + { + "epoch": 0.7347568819140725, + "grad_norm": 0.2057684361934662, + "learning_rate": 1.404352806414662e-05, + "loss": 0.2668, + "step": 357 + }, + { + "epoch": 0.7368150244404424, + "grad_norm": 0.190156951546669, + "learning_rate": 1.402061855670103e-05, + "loss": 0.2778, + "step": 358 + }, + { + "epoch": 0.7388731669668125, + "grad_norm": 0.19387219846248627, + "learning_rate": 1.3997709049255441e-05, + "loss": 0.2785, + "step": 359 + }, + { + "epoch": 0.7409313094931824, + "grad_norm": 0.1933836042881012, + "learning_rate": 1.3974799541809852e-05, + "loss": 0.2661, + "step": 360 + }, + { + "epoch": 0.7429894520195524, + "grad_norm": 0.19618812203407288, + "learning_rate": 1.3951890034364261e-05, + "loss": 0.2622, + "step": 361 + }, + { + "epoch": 0.7450475945459223, + "grad_norm": 0.18786942958831787, + "learning_rate": 1.3928980526918671e-05, + "loss": 0.2695, + "step": 362 + }, + { + "epoch": 0.7471057370722922, + "grad_norm": 0.19361330568790436, + "learning_rate": 1.3906071019473082e-05, + "loss": 0.2869, + "step": 363 + }, + { + "epoch": 0.7491638795986622, + "grad_norm": 0.19813291728496552, + "learning_rate": 1.3883161512027493e-05, + "loss": 0.2753, + "step": 364 + }, + { + "epoch": 0.7512220221250322, + "grad_norm": 0.1891734004020691, + "learning_rate": 1.3860252004581902e-05, + "loss": 0.2694, + "step": 365 + }, + { + "epoch": 0.7532801646514021, + "grad_norm": 0.18902742862701416, + "learning_rate": 1.3837342497136312e-05, + "loss": 0.2675, + "step": 366 + }, + { + "epoch": 0.7553383071777721, + "grad_norm": 0.19838480651378632, + "learning_rate": 1.3814432989690723e-05, + "loss": 0.2721, + "step": 367 + }, + { + "epoch": 0.757396449704142, + "grad_norm": 0.20880939066410065, + "learning_rate": 1.3791523482245133e-05, + "loss": 0.2641, + "step": 368 + }, + { + "epoch": 0.7594545922305119, + "grad_norm": 0.20068003237247467, + "learning_rate": 1.3768613974799542e-05, + "loss": 0.2945, + "step": 369 + }, + { + "epoch": 0.7615127347568819, + "grad_norm": 0.19780132174491882, + "learning_rate": 1.3745704467353953e-05, + "loss": 0.2687, + "step": 370 + }, + { + "epoch": 0.7635708772832519, + "grad_norm": 0.19194689393043518, + "learning_rate": 1.3722794959908363e-05, + "loss": 0.2731, + "step": 371 + }, + { + "epoch": 0.7656290198096218, + "grad_norm": 0.19504573941230774, + "learning_rate": 1.3699885452462774e-05, + "loss": 0.2551, + "step": 372 + }, + { + "epoch": 0.7676871623359918, + "grad_norm": 0.18304413557052612, + "learning_rate": 1.3676975945017183e-05, + "loss": 0.2692, + "step": 373 + }, + { + "epoch": 0.7697453048623617, + "grad_norm": 0.2051483392715454, + "learning_rate": 1.3654066437571593e-05, + "loss": 0.2791, + "step": 374 + }, + { + "epoch": 0.7718034473887316, + "grad_norm": 0.18748973309993744, + "learning_rate": 1.3631156930126004e-05, + "loss": 0.2671, + "step": 375 + }, + { + "epoch": 0.7738615899151016, + "grad_norm": 0.19167177379131317, + "learning_rate": 1.3608247422680415e-05, + "loss": 0.2766, + "step": 376 + }, + { + "epoch": 0.7759197324414716, + "grad_norm": 0.17931750416755676, + "learning_rate": 1.3585337915234824e-05, + "loss": 0.2748, + "step": 377 + }, + { + "epoch": 0.7779778749678415, + "grad_norm": 0.19437509775161743, + "learning_rate": 1.3562428407789234e-05, + "loss": 0.2667, + "step": 378 + }, + { + "epoch": 0.7800360174942115, + "grad_norm": 0.19813868403434753, + "learning_rate": 1.3539518900343645e-05, + "loss": 0.2771, + "step": 379 + }, + { + "epoch": 0.7820941600205814, + "grad_norm": 0.19205260276794434, + "learning_rate": 1.3516609392898055e-05, + "loss": 0.2703, + "step": 380 + }, + { + "epoch": 0.7841523025469513, + "grad_norm": 0.19039763510227203, + "learning_rate": 1.3493699885452464e-05, + "loss": 0.264, + "step": 381 + }, + { + "epoch": 0.7862104450733213, + "grad_norm": 0.18269500136375427, + "learning_rate": 1.3470790378006875e-05, + "loss": 0.2653, + "step": 382 + }, + { + "epoch": 0.7882685875996913, + "grad_norm": 0.1922067403793335, + "learning_rate": 1.3447880870561285e-05, + "loss": 0.2754, + "step": 383 + }, + { + "epoch": 0.7903267301260612, + "grad_norm": 0.19615666568279266, + "learning_rate": 1.3424971363115693e-05, + "loss": 0.2811, + "step": 384 + }, + { + "epoch": 0.7923848726524312, + "grad_norm": 0.19037973880767822, + "learning_rate": 1.3402061855670103e-05, + "loss": 0.2673, + "step": 385 + }, + { + "epoch": 0.7944430151788011, + "grad_norm": 0.191124826669693, + "learning_rate": 1.3379152348224514e-05, + "loss": 0.2683, + "step": 386 + }, + { + "epoch": 0.796501157705171, + "grad_norm": 0.18429923057556152, + "learning_rate": 1.3356242840778923e-05, + "loss": 0.2698, + "step": 387 + }, + { + "epoch": 0.798559300231541, + "grad_norm": 0.1839045137166977, + "learning_rate": 1.3333333333333333e-05, + "loss": 0.2895, + "step": 388 + }, + { + "epoch": 0.800617442757911, + "grad_norm": 0.1944131702184677, + "learning_rate": 1.3310423825887744e-05, + "loss": 0.2641, + "step": 389 + }, + { + "epoch": 0.802675585284281, + "grad_norm": 0.20407740771770477, + "learning_rate": 1.3287514318442154e-05, + "loss": 0.2743, + "step": 390 + }, + { + "epoch": 0.8047337278106509, + "grad_norm": 0.1814037561416626, + "learning_rate": 1.3264604810996563e-05, + "loss": 0.2672, + "step": 391 + }, + { + "epoch": 0.8067918703370208, + "grad_norm": 0.1886950582265854, + "learning_rate": 1.3241695303550974e-05, + "loss": 0.2725, + "step": 392 + }, + { + "epoch": 0.8088500128633908, + "grad_norm": 0.19429941475391388, + "learning_rate": 1.3218785796105385e-05, + "loss": 0.2669, + "step": 393 + }, + { + "epoch": 0.8109081553897607, + "grad_norm": 0.19143058359622955, + "learning_rate": 1.3195876288659795e-05, + "loss": 0.2659, + "step": 394 + }, + { + "epoch": 0.8129662979161307, + "grad_norm": 0.2213468849658966, + "learning_rate": 1.3172966781214204e-05, + "loss": 0.2764, + "step": 395 + }, + { + "epoch": 0.8150244404425007, + "grad_norm": 0.2040800005197525, + "learning_rate": 1.3150057273768615e-05, + "loss": 0.2783, + "step": 396 + }, + { + "epoch": 0.8170825829688706, + "grad_norm": 0.1948375254869461, + "learning_rate": 1.3127147766323025e-05, + "loss": 0.2689, + "step": 397 + }, + { + "epoch": 0.8191407254952405, + "grad_norm": 0.1915021538734436, + "learning_rate": 1.3104238258877436e-05, + "loss": 0.2808, + "step": 398 + }, + { + "epoch": 0.8211988680216105, + "grad_norm": 0.19760248064994812, + "learning_rate": 1.3081328751431845e-05, + "loss": 0.2712, + "step": 399 + }, + { + "epoch": 0.8232570105479804, + "grad_norm": 0.2082677185535431, + "learning_rate": 1.3058419243986255e-05, + "loss": 0.2707, + "step": 400 + }, + { + "epoch": 0.8232570105479804, + "eval_loss": 0.28778496384620667, + "eval_runtime": 2427.9096, + "eval_samples_per_second": 3.202, + "eval_steps_per_second": 0.801, + "step": 400 + }, + { + "epoch": 0.8253151530743504, + "grad_norm": 0.19694332778453827, + "learning_rate": 1.3035509736540666e-05, + "loss": 0.2801, + "step": 401 + }, + { + "epoch": 0.8273732956007204, + "grad_norm": 0.19448824226856232, + "learning_rate": 1.3012600229095077e-05, + "loss": 0.2632, + "step": 402 + }, + { + "epoch": 0.8294314381270903, + "grad_norm": 0.18745476007461548, + "learning_rate": 1.2989690721649485e-05, + "loss": 0.2773, + "step": 403 + }, + { + "epoch": 0.8314895806534602, + "grad_norm": 0.19524575769901276, + "learning_rate": 1.2966781214203896e-05, + "loss": 0.2594, + "step": 404 + }, + { + "epoch": 0.8335477231798302, + "grad_norm": 0.19612252712249756, + "learning_rate": 1.2943871706758307e-05, + "loss": 0.271, + "step": 405 + }, + { + "epoch": 0.8356058657062001, + "grad_norm": 0.19964493811130524, + "learning_rate": 1.2920962199312717e-05, + "loss": 0.2615, + "step": 406 + }, + { + "epoch": 0.8376640082325701, + "grad_norm": 0.20115099847316742, + "learning_rate": 1.2898052691867126e-05, + "loss": 0.269, + "step": 407 + }, + { + "epoch": 0.8397221507589401, + "grad_norm": 0.18949687480926514, + "learning_rate": 1.2875143184421537e-05, + "loss": 0.2649, + "step": 408 + }, + { + "epoch": 0.84178029328531, + "grad_norm": 0.1931927353143692, + "learning_rate": 1.2852233676975947e-05, + "loss": 0.2611, + "step": 409 + }, + { + "epoch": 0.8438384358116799, + "grad_norm": 0.18723614513874054, + "learning_rate": 1.2829324169530358e-05, + "loss": 0.2699, + "step": 410 + }, + { + "epoch": 0.8458965783380499, + "grad_norm": 0.19405977427959442, + "learning_rate": 1.2806414662084765e-05, + "loss": 0.2691, + "step": 411 + }, + { + "epoch": 0.8479547208644198, + "grad_norm": 0.2021879404783249, + "learning_rate": 1.2783505154639176e-05, + "loss": 0.267, + "step": 412 + }, + { + "epoch": 0.8500128633907899, + "grad_norm": 0.20015574991703033, + "learning_rate": 1.2760595647193586e-05, + "loss": 0.2632, + "step": 413 + }, + { + "epoch": 0.8520710059171598, + "grad_norm": 0.19090059399604797, + "learning_rate": 1.2737686139747995e-05, + "loss": 0.2743, + "step": 414 + }, + { + "epoch": 0.8541291484435297, + "grad_norm": 0.1906920224428177, + "learning_rate": 1.2714776632302406e-05, + "loss": 0.2723, + "step": 415 + }, + { + "epoch": 0.8561872909698997, + "grad_norm": 0.19348129630088806, + "learning_rate": 1.2691867124856816e-05, + "loss": 0.2656, + "step": 416 + }, + { + "epoch": 0.8582454334962696, + "grad_norm": 0.18771213293075562, + "learning_rate": 1.2668957617411227e-05, + "loss": 0.2617, + "step": 417 + }, + { + "epoch": 0.8603035760226395, + "grad_norm": 0.2135135382413864, + "learning_rate": 1.2646048109965636e-05, + "loss": 0.2773, + "step": 418 + }, + { + "epoch": 0.8623617185490096, + "grad_norm": 0.19689443707466125, + "learning_rate": 1.2623138602520046e-05, + "loss": 0.2623, + "step": 419 + }, + { + "epoch": 0.8644198610753795, + "grad_norm": 0.18752440810203552, + "learning_rate": 1.2600229095074457e-05, + "loss": 0.2599, + "step": 420 + }, + { + "epoch": 0.8664780036017494, + "grad_norm": 0.19264395534992218, + "learning_rate": 1.2577319587628866e-05, + "loss": 0.2707, + "step": 421 + }, + { + "epoch": 0.8685361461281194, + "grad_norm": 0.19980797171592712, + "learning_rate": 1.2554410080183277e-05, + "loss": 0.2616, + "step": 422 + }, + { + "epoch": 0.8705942886544893, + "grad_norm": 0.22940242290496826, + "learning_rate": 1.2531500572737687e-05, + "loss": 0.2712, + "step": 423 + }, + { + "epoch": 0.8726524311808592, + "grad_norm": 0.18825359642505646, + "learning_rate": 1.2508591065292098e-05, + "loss": 0.2779, + "step": 424 + }, + { + "epoch": 0.8747105737072293, + "grad_norm": 0.21553562581539154, + "learning_rate": 1.2485681557846507e-05, + "loss": 0.2677, + "step": 425 + }, + { + "epoch": 0.8767687162335992, + "grad_norm": 0.2025568038225174, + "learning_rate": 1.2462772050400917e-05, + "loss": 0.2659, + "step": 426 + }, + { + "epoch": 0.8788268587599691, + "grad_norm": 0.19179950654506683, + "learning_rate": 1.2439862542955328e-05, + "loss": 0.2762, + "step": 427 + }, + { + "epoch": 0.8808850012863391, + "grad_norm": 0.20982210338115692, + "learning_rate": 1.2416953035509738e-05, + "loss": 0.2648, + "step": 428 + }, + { + "epoch": 0.882943143812709, + "grad_norm": 0.2084280252456665, + "learning_rate": 1.2394043528064147e-05, + "loss": 0.2806, + "step": 429 + }, + { + "epoch": 0.8850012863390789, + "grad_norm": 0.1993308663368225, + "learning_rate": 1.2371134020618558e-05, + "loss": 0.2673, + "step": 430 + }, + { + "epoch": 0.887059428865449, + "grad_norm": 0.1917535811662674, + "learning_rate": 1.2348224513172968e-05, + "loss": 0.2596, + "step": 431 + }, + { + "epoch": 0.8891175713918189, + "grad_norm": 0.18980742990970612, + "learning_rate": 1.2325315005727379e-05, + "loss": 0.2607, + "step": 432 + }, + { + "epoch": 0.8911757139181888, + "grad_norm": 0.21062685549259186, + "learning_rate": 1.2302405498281788e-05, + "loss": 0.2612, + "step": 433 + }, + { + "epoch": 0.8932338564445588, + "grad_norm": 0.20591405034065247, + "learning_rate": 1.2279495990836199e-05, + "loss": 0.2698, + "step": 434 + }, + { + "epoch": 0.8952919989709287, + "grad_norm": 0.2052398920059204, + "learning_rate": 1.2256586483390609e-05, + "loss": 0.2673, + "step": 435 + }, + { + "epoch": 0.8973501414972986, + "grad_norm": 0.19963452219963074, + "learning_rate": 1.223367697594502e-05, + "loss": 0.266, + "step": 436 + }, + { + "epoch": 0.8994082840236687, + "grad_norm": 0.1929163783788681, + "learning_rate": 1.2210767468499429e-05, + "loss": 0.2605, + "step": 437 + }, + { + "epoch": 0.9014664265500386, + "grad_norm": 0.19121681153774261, + "learning_rate": 1.218785796105384e-05, + "loss": 0.2642, + "step": 438 + }, + { + "epoch": 0.9035245690764085, + "grad_norm": 0.18931221961975098, + "learning_rate": 1.2164948453608248e-05, + "loss": 0.2653, + "step": 439 + }, + { + "epoch": 0.9055827116027785, + "grad_norm": 0.21359370648860931, + "learning_rate": 1.2142038946162657e-05, + "loss": 0.264, + "step": 440 + }, + { + "epoch": 0.9076408541291484, + "grad_norm": 0.1874193251132965, + "learning_rate": 1.2119129438717068e-05, + "loss": 0.2664, + "step": 441 + }, + { + "epoch": 0.9096989966555183, + "grad_norm": 0.19697226583957672, + "learning_rate": 1.2096219931271478e-05, + "loss": 0.2651, + "step": 442 + }, + { + "epoch": 0.9117571391818884, + "grad_norm": 0.20930957794189453, + "learning_rate": 1.2073310423825889e-05, + "loss": 0.2724, + "step": 443 + }, + { + "epoch": 0.9138152817082583, + "grad_norm": 0.19588977098464966, + "learning_rate": 1.2050400916380298e-05, + "loss": 0.2648, + "step": 444 + }, + { + "epoch": 0.9158734242346283, + "grad_norm": 0.19452017545700073, + "learning_rate": 1.2027491408934708e-05, + "loss": 0.2808, + "step": 445 + }, + { + "epoch": 0.9179315667609982, + "grad_norm": 0.19226408004760742, + "learning_rate": 1.2004581901489119e-05, + "loss": 0.2627, + "step": 446 + }, + { + "epoch": 0.9199897092873681, + "grad_norm": 0.18108274042606354, + "learning_rate": 1.198167239404353e-05, + "loss": 0.2693, + "step": 447 + }, + { + "epoch": 0.922047851813738, + "grad_norm": 0.19352363049983978, + "learning_rate": 1.1958762886597938e-05, + "loss": 0.2705, + "step": 448 + }, + { + "epoch": 0.9241059943401081, + "grad_norm": 0.18535122275352478, + "learning_rate": 1.1935853379152349e-05, + "loss": 0.2608, + "step": 449 + }, + { + "epoch": 0.926164136866478, + "grad_norm": 0.19209617376327515, + "learning_rate": 1.191294387170676e-05, + "loss": 0.2702, + "step": 450 + }, + { + "epoch": 0.928222279392848, + "grad_norm": 0.1866796910762787, + "learning_rate": 1.189003436426117e-05, + "loss": 0.264, + "step": 451 + }, + { + "epoch": 0.9302804219192179, + "grad_norm": 0.21708665788173676, + "learning_rate": 1.1867124856815579e-05, + "loss": 0.2693, + "step": 452 + }, + { + "epoch": 0.9323385644455878, + "grad_norm": 0.19297796487808228, + "learning_rate": 1.184421534936999e-05, + "loss": 0.2745, + "step": 453 + }, + { + "epoch": 0.9343967069719578, + "grad_norm": 0.19070400297641754, + "learning_rate": 1.18213058419244e-05, + "loss": 0.265, + "step": 454 + }, + { + "epoch": 0.9364548494983278, + "grad_norm": 0.19821566343307495, + "learning_rate": 1.1798396334478809e-05, + "loss": 0.2674, + "step": 455 + }, + { + "epoch": 0.9385129920246977, + "grad_norm": 0.2032192200422287, + "learning_rate": 1.177548682703322e-05, + "loss": 0.276, + "step": 456 + }, + { + "epoch": 0.9405711345510677, + "grad_norm": 0.19127750396728516, + "learning_rate": 1.175257731958763e-05, + "loss": 0.2696, + "step": 457 + }, + { + "epoch": 0.9426292770774376, + "grad_norm": 0.19187286496162415, + "learning_rate": 1.1729667812142041e-05, + "loss": 0.2601, + "step": 458 + }, + { + "epoch": 0.9446874196038075, + "grad_norm": 0.20871371030807495, + "learning_rate": 1.170675830469645e-05, + "loss": 0.2687, + "step": 459 + }, + { + "epoch": 0.9467455621301775, + "grad_norm": 0.19228306412696838, + "learning_rate": 1.168384879725086e-05, + "loss": 0.2633, + "step": 460 + }, + { + "epoch": 0.9488037046565475, + "grad_norm": 0.19025444984436035, + "learning_rate": 1.1660939289805271e-05, + "loss": 0.2721, + "step": 461 + }, + { + "epoch": 0.9508618471829174, + "grad_norm": 0.19476914405822754, + "learning_rate": 1.1638029782359682e-05, + "loss": 0.2662, + "step": 462 + }, + { + "epoch": 0.9529199897092874, + "grad_norm": 0.1991666853427887, + "learning_rate": 1.161512027491409e-05, + "loss": 0.269, + "step": 463 + }, + { + "epoch": 0.9549781322356573, + "grad_norm": 0.19385920464992523, + "learning_rate": 1.1592210767468501e-05, + "loss": 0.2647, + "step": 464 + }, + { + "epoch": 0.9570362747620272, + "grad_norm": 0.1911603957414627, + "learning_rate": 1.1569301260022912e-05, + "loss": 0.2679, + "step": 465 + }, + { + "epoch": 0.9590944172883972, + "grad_norm": 0.20373377203941345, + "learning_rate": 1.1546391752577319e-05, + "loss": 0.2694, + "step": 466 + }, + { + "epoch": 0.9611525598147672, + "grad_norm": 0.20550350844860077, + "learning_rate": 1.152348224513173e-05, + "loss": 0.2677, + "step": 467 + }, + { + "epoch": 0.9632107023411371, + "grad_norm": 0.2049354463815689, + "learning_rate": 1.150057273768614e-05, + "loss": 0.2752, + "step": 468 + }, + { + "epoch": 0.9652688448675071, + "grad_norm": 0.21691595017910004, + "learning_rate": 1.147766323024055e-05, + "loss": 0.2727, + "step": 469 + }, + { + "epoch": 0.967326987393877, + "grad_norm": 0.20727306604385376, + "learning_rate": 1.145475372279496e-05, + "loss": 0.2575, + "step": 470 + }, + { + "epoch": 0.969385129920247, + "grad_norm": 0.19166423380374908, + "learning_rate": 1.143184421534937e-05, + "loss": 0.2716, + "step": 471 + }, + { + "epoch": 0.9714432724466169, + "grad_norm": 0.18833886086940765, + "learning_rate": 1.140893470790378e-05, + "loss": 0.2651, + "step": 472 + }, + { + "epoch": 0.9735014149729869, + "grad_norm": 0.19680088758468628, + "learning_rate": 1.1386025200458191e-05, + "loss": 0.2621, + "step": 473 + }, + { + "epoch": 0.9755595574993569, + "grad_norm": 0.20966476202011108, + "learning_rate": 1.13631156930126e-05, + "loss": 0.2725, + "step": 474 + }, + { + "epoch": 0.9776177000257268, + "grad_norm": 0.1963450163602829, + "learning_rate": 1.134020618556701e-05, + "loss": 0.2569, + "step": 475 + }, + { + "epoch": 0.9796758425520967, + "grad_norm": 0.21289944648742676, + "learning_rate": 1.1317296678121421e-05, + "loss": 0.2622, + "step": 476 + }, + { + "epoch": 0.9817339850784667, + "grad_norm": 0.2103341966867447, + "learning_rate": 1.1294387170675832e-05, + "loss": 0.2803, + "step": 477 + }, + { + "epoch": 0.9837921276048366, + "grad_norm": 0.20202945172786713, + "learning_rate": 1.1271477663230241e-05, + "loss": 0.273, + "step": 478 + }, + { + "epoch": 0.9858502701312066, + "grad_norm": 0.18241006135940552, + "learning_rate": 1.1248568155784651e-05, + "loss": 0.2721, + "step": 479 + }, + { + "epoch": 0.9879084126575766, + "grad_norm": 0.19221259653568268, + "learning_rate": 1.1225658648339062e-05, + "loss": 0.2646, + "step": 480 + }, + { + "epoch": 0.9899665551839465, + "grad_norm": 0.19371837377548218, + "learning_rate": 1.1202749140893473e-05, + "loss": 0.2519, + "step": 481 + }, + { + "epoch": 0.9920246977103164, + "grad_norm": 0.1972094029188156, + "learning_rate": 1.1179839633447882e-05, + "loss": 0.2555, + "step": 482 + }, + { + "epoch": 0.9940828402366864, + "grad_norm": 0.19414126873016357, + "learning_rate": 1.1156930126002292e-05, + "loss": 0.2726, + "step": 483 + }, + { + "epoch": 0.9961409827630563, + "grad_norm": 0.18993492424488068, + "learning_rate": 1.1134020618556703e-05, + "loss": 0.2644, + "step": 484 + }, + { + "epoch": 0.9981991252894263, + "grad_norm": 0.19713927805423737, + "learning_rate": 1.1111111111111113e-05, + "loss": 0.2569, + "step": 485 + }, + { + "epoch": 1.00205814252637, + "grad_norm": 0.3423589766025543, + "learning_rate": 1.1088201603665522e-05, + "loss": 0.5285, + "step": 486 + }, + { + "epoch": 1.0041162850527399, + "grad_norm": 0.1901763528585434, + "learning_rate": 1.1065292096219933e-05, + "loss": 0.2621, + "step": 487 + }, + { + "epoch": 1.0061744275791098, + "grad_norm": 0.20508776605129242, + "learning_rate": 1.1042382588774343e-05, + "loss": 0.2665, + "step": 488 + }, + { + "epoch": 1.0082325701054797, + "grad_norm": 0.20188146829605103, + "learning_rate": 1.1019473081328752e-05, + "loss": 0.2547, + "step": 489 + }, + { + "epoch": 1.0102907126318497, + "grad_norm": 0.20245613157749176, + "learning_rate": 1.0996563573883163e-05, + "loss": 0.2657, + "step": 490 + }, + { + "epoch": 1.0123488551582196, + "grad_norm": 0.19711382687091827, + "learning_rate": 1.0973654066437574e-05, + "loss": 0.2597, + "step": 491 + }, + { + "epoch": 1.0144069976845898, + "grad_norm": 0.21538953483104706, + "learning_rate": 1.0950744558991984e-05, + "loss": 0.2727, + "step": 492 + }, + { + "epoch": 1.0164651402109597, + "grad_norm": 0.20296984910964966, + "learning_rate": 1.0927835051546391e-05, + "loss": 0.2634, + "step": 493 + }, + { + "epoch": 1.0185232827373296, + "grad_norm": 0.20134592056274414, + "learning_rate": 1.0904925544100802e-05, + "loss": 0.2596, + "step": 494 + }, + { + "epoch": 1.0205814252636995, + "grad_norm": 0.200101837515831, + "learning_rate": 1.0882016036655212e-05, + "loss": 0.2575, + "step": 495 + }, + { + "epoch": 1.0226395677900695, + "grad_norm": 0.19144928455352783, + "learning_rate": 1.0859106529209621e-05, + "loss": 0.263, + "step": 496 + }, + { + "epoch": 1.0246977103164394, + "grad_norm": 0.19832482933998108, + "learning_rate": 1.0836197021764032e-05, + "loss": 0.2656, + "step": 497 + }, + { + "epoch": 1.0267558528428093, + "grad_norm": 0.20965202152729034, + "learning_rate": 1.0813287514318443e-05, + "loss": 0.2611, + "step": 498 + }, + { + "epoch": 1.0288139953691793, + "grad_norm": 0.1974337100982666, + "learning_rate": 1.0790378006872853e-05, + "loss": 0.2667, + "step": 499 + }, + { + "epoch": 1.0308721378955492, + "grad_norm": 0.20611713826656342, + "learning_rate": 1.0767468499427262e-05, + "loss": 0.2674, + "step": 500 + }, + { + "epoch": 1.0308721378955492, + "eval_loss": 0.2836935222148895, + "eval_runtime": 2423.44, + "eval_samples_per_second": 3.208, + "eval_steps_per_second": 0.802, + "step": 500 + }, + { + "epoch": 1.0329302804219191, + "grad_norm": 0.202958345413208, + "learning_rate": 1.0744558991981673e-05, + "loss": 0.2684, + "step": 501 + }, + { + "epoch": 1.034988422948289, + "grad_norm": 0.1984429508447647, + "learning_rate": 1.0721649484536083e-05, + "loss": 0.2557, + "step": 502 + }, + { + "epoch": 1.0370465654746592, + "grad_norm": 0.19396482408046722, + "learning_rate": 1.0698739977090494e-05, + "loss": 0.255, + "step": 503 + }, + { + "epoch": 1.0391047080010292, + "grad_norm": 0.19176840782165527, + "learning_rate": 1.0675830469644903e-05, + "loss": 0.2675, + "step": 504 + }, + { + "epoch": 1.041162850527399, + "grad_norm": 0.20167966187000275, + "learning_rate": 1.0652920962199313e-05, + "loss": 0.2669, + "step": 505 + }, + { + "epoch": 1.043220993053769, + "grad_norm": 0.2049783617258072, + "learning_rate": 1.0630011454753724e-05, + "loss": 0.2446, + "step": 506 + }, + { + "epoch": 1.045279135580139, + "grad_norm": 0.19293472170829773, + "learning_rate": 1.0607101947308135e-05, + "loss": 0.256, + "step": 507 + }, + { + "epoch": 1.047337278106509, + "grad_norm": 0.19432370364665985, + "learning_rate": 1.0584192439862543e-05, + "loss": 0.2605, + "step": 508 + }, + { + "epoch": 1.0493954206328788, + "grad_norm": 0.19784876704216003, + "learning_rate": 1.0561282932416954e-05, + "loss": 0.2617, + "step": 509 + }, + { + "epoch": 1.0514535631592488, + "grad_norm": 0.19982090592384338, + "learning_rate": 1.0538373424971365e-05, + "loss": 0.264, + "step": 510 + }, + { + "epoch": 1.0535117056856187, + "grad_norm": 0.2019587755203247, + "learning_rate": 1.0515463917525775e-05, + "loss": 0.2543, + "step": 511 + }, + { + "epoch": 1.0555698482119886, + "grad_norm": 0.19848807156085968, + "learning_rate": 1.0492554410080184e-05, + "loss": 0.2613, + "step": 512 + }, + { + "epoch": 1.0576279907383586, + "grad_norm": 0.20360374450683594, + "learning_rate": 1.0469644902634595e-05, + "loss": 0.2675, + "step": 513 + }, + { + "epoch": 1.0596861332647285, + "grad_norm": 0.19209840893745422, + "learning_rate": 1.0446735395189005e-05, + "loss": 0.2517, + "step": 514 + }, + { + "epoch": 1.0617442757910984, + "grad_norm": 0.19142381846904755, + "learning_rate": 1.0423825887743416e-05, + "loss": 0.2631, + "step": 515 + }, + { + "epoch": 1.0638024183174686, + "grad_norm": 0.20222575962543488, + "learning_rate": 1.0400916380297825e-05, + "loss": 0.2625, + "step": 516 + }, + { + "epoch": 1.0658605608438385, + "grad_norm": 0.1984448879957199, + "learning_rate": 1.0378006872852235e-05, + "loss": 0.2584, + "step": 517 + }, + { + "epoch": 1.0679187033702084, + "grad_norm": 0.1992885023355484, + "learning_rate": 1.0355097365406646e-05, + "loss": 0.2609, + "step": 518 + }, + { + "epoch": 1.0699768458965784, + "grad_norm": 0.20708978176116943, + "learning_rate": 1.0332187857961057e-05, + "loss": 0.2618, + "step": 519 + }, + { + "epoch": 1.0720349884229483, + "grad_norm": 0.22806766629219055, + "learning_rate": 1.0309278350515464e-05, + "loss": 0.2634, + "step": 520 + }, + { + "epoch": 1.0740931309493182, + "grad_norm": 0.2019941806793213, + "learning_rate": 1.0286368843069874e-05, + "loss": 0.2588, + "step": 521 + }, + { + "epoch": 1.0761512734756882, + "grad_norm": 0.19460470974445343, + "learning_rate": 1.0263459335624283e-05, + "loss": 0.2692, + "step": 522 + }, + { + "epoch": 1.078209416002058, + "grad_norm": 0.19483187794685364, + "learning_rate": 1.0240549828178694e-05, + "loss": 0.2474, + "step": 523 + }, + { + "epoch": 1.080267558528428, + "grad_norm": 0.2199576050043106, + "learning_rate": 1.0217640320733104e-05, + "loss": 0.2582, + "step": 524 + }, + { + "epoch": 1.082325701054798, + "grad_norm": 0.20485302805900574, + "learning_rate": 1.0194730813287515e-05, + "loss": 0.2463, + "step": 525 + }, + { + "epoch": 1.084383843581168, + "grad_norm": 0.20773454010486603, + "learning_rate": 1.0171821305841924e-05, + "loss": 0.2501, + "step": 526 + }, + { + "epoch": 1.086441986107538, + "grad_norm": 0.19593262672424316, + "learning_rate": 1.0148911798396335e-05, + "loss": 0.2608, + "step": 527 + }, + { + "epoch": 1.088500128633908, + "grad_norm": 0.20500554144382477, + "learning_rate": 1.0126002290950745e-05, + "loss": 0.2586, + "step": 528 + }, + { + "epoch": 1.090558271160278, + "grad_norm": 0.19919747114181519, + "learning_rate": 1.0103092783505156e-05, + "loss": 0.2724, + "step": 529 + }, + { + "epoch": 1.0926164136866479, + "grad_norm": 0.1953326314687729, + "learning_rate": 1.0080183276059565e-05, + "loss": 0.2456, + "step": 530 + }, + { + "epoch": 1.0946745562130178, + "grad_norm": 0.2155047059059143, + "learning_rate": 1.0057273768613975e-05, + "loss": 0.2644, + "step": 531 + }, + { + "epoch": 1.0967326987393877, + "grad_norm": 0.19747495651245117, + "learning_rate": 1.0034364261168386e-05, + "loss": 0.2539, + "step": 532 + }, + { + "epoch": 1.0987908412657577, + "grad_norm": 0.20261652767658234, + "learning_rate": 1.0011454753722796e-05, + "loss": 0.255, + "step": 533 + }, + { + "epoch": 1.1008489837921276, + "grad_norm": 0.19529719650745392, + "learning_rate": 9.988545246277205e-06, + "loss": 0.2489, + "step": 534 + }, + { + "epoch": 1.1029071263184975, + "grad_norm": 0.20239490270614624, + "learning_rate": 9.965635738831616e-06, + "loss": 0.2664, + "step": 535 + }, + { + "epoch": 1.1049652688448675, + "grad_norm": 0.19377024471759796, + "learning_rate": 9.942726231386026e-06, + "loss": 0.2615, + "step": 536 + }, + { + "epoch": 1.1070234113712374, + "grad_norm": 0.20523156225681305, + "learning_rate": 9.919816723940437e-06, + "loss": 0.2548, + "step": 537 + }, + { + "epoch": 1.1090815538976073, + "grad_norm": 0.2046228051185608, + "learning_rate": 9.896907216494846e-06, + "loss": 0.2704, + "step": 538 + }, + { + "epoch": 1.1111396964239773, + "grad_norm": 0.21209484338760376, + "learning_rate": 9.873997709049257e-06, + "loss": 0.2637, + "step": 539 + }, + { + "epoch": 1.1131978389503474, + "grad_norm": 0.20251420140266418, + "learning_rate": 9.851088201603667e-06, + "loss": 0.2617, + "step": 540 + }, + { + "epoch": 1.1152559814767173, + "grad_norm": 0.21695846319198608, + "learning_rate": 9.828178694158076e-06, + "loss": 0.2658, + "step": 541 + }, + { + "epoch": 1.1173141240030873, + "grad_norm": 0.2015303075313568, + "learning_rate": 9.805269186712487e-06, + "loss": 0.2528, + "step": 542 + }, + { + "epoch": 1.1193722665294572, + "grad_norm": 0.21796390414237976, + "learning_rate": 9.782359679266896e-06, + "loss": 0.2625, + "step": 543 + }, + { + "epoch": 1.1214304090558271, + "grad_norm": 0.20676304399967194, + "learning_rate": 9.759450171821306e-06, + "loss": 0.268, + "step": 544 + }, + { + "epoch": 1.123488551582197, + "grad_norm": 0.1986500769853592, + "learning_rate": 9.736540664375717e-06, + "loss": 0.2546, + "step": 545 + }, + { + "epoch": 1.125546694108567, + "grad_norm": 0.20008589327335358, + "learning_rate": 9.713631156930127e-06, + "loss": 0.2525, + "step": 546 + }, + { + "epoch": 1.127604836634937, + "grad_norm": 0.1891598105430603, + "learning_rate": 9.690721649484536e-06, + "loss": 0.256, + "step": 547 + }, + { + "epoch": 1.1296629791613069, + "grad_norm": 0.20968230068683624, + "learning_rate": 9.667812142038947e-06, + "loss": 0.2495, + "step": 548 + }, + { + "epoch": 1.1317211216876768, + "grad_norm": 0.2025834023952484, + "learning_rate": 9.644902634593357e-06, + "loss": 0.2533, + "step": 549 + }, + { + "epoch": 1.1337792642140467, + "grad_norm": 0.21087367832660675, + "learning_rate": 9.621993127147768e-06, + "loss": 0.2518, + "step": 550 + }, + { + "epoch": 1.1358374067404169, + "grad_norm": 0.20784996449947357, + "learning_rate": 9.599083619702177e-06, + "loss": 0.2594, + "step": 551 + }, + { + "epoch": 1.1378955492667868, + "grad_norm": 0.20754118263721466, + "learning_rate": 9.576174112256587e-06, + "loss": 0.2515, + "step": 552 + }, + { + "epoch": 1.1399536917931568, + "grad_norm": 0.225090891122818, + "learning_rate": 9.553264604810998e-06, + "loss": 0.2615, + "step": 553 + }, + { + "epoch": 1.1420118343195267, + "grad_norm": 0.24656590819358826, + "learning_rate": 9.530355097365407e-06, + "loss": 0.2636, + "step": 554 + }, + { + "epoch": 1.1440699768458966, + "grad_norm": 0.22454337775707245, + "learning_rate": 9.507445589919818e-06, + "loss": 0.2584, + "step": 555 + }, + { + "epoch": 1.1461281193722666, + "grad_norm": 0.2229425013065338, + "learning_rate": 9.484536082474226e-06, + "loss": 0.2543, + "step": 556 + }, + { + "epoch": 1.1481862618986365, + "grad_norm": 0.18805071711540222, + "learning_rate": 9.461626575028637e-06, + "loss": 0.2593, + "step": 557 + }, + { + "epoch": 1.1502444044250064, + "grad_norm": 0.23163346946239471, + "learning_rate": 9.438717067583048e-06, + "loss": 0.2537, + "step": 558 + }, + { + "epoch": 1.1523025469513763, + "grad_norm": 0.2126983255147934, + "learning_rate": 9.415807560137458e-06, + "loss": 0.2598, + "step": 559 + }, + { + "epoch": 1.1543606894777463, + "grad_norm": 0.2113332748413086, + "learning_rate": 9.392898052691867e-06, + "loss": 0.2617, + "step": 560 + }, + { + "epoch": 1.1564188320041162, + "grad_norm": 0.2220505177974701, + "learning_rate": 9.369988545246278e-06, + "loss": 0.2673, + "step": 561 + }, + { + "epoch": 1.1584769745304861, + "grad_norm": 0.21683354675769806, + "learning_rate": 9.347079037800688e-06, + "loss": 0.259, + "step": 562 + }, + { + "epoch": 1.160535117056856, + "grad_norm": 0.20226940512657166, + "learning_rate": 9.324169530355099e-06, + "loss": 0.2536, + "step": 563 + }, + { + "epoch": 1.1625932595832262, + "grad_norm": 0.2166106402873993, + "learning_rate": 9.301260022909508e-06, + "loss": 0.2573, + "step": 564 + }, + { + "epoch": 1.1646514021095962, + "grad_norm": 0.21802830696105957, + "learning_rate": 9.278350515463918e-06, + "loss": 0.2604, + "step": 565 + }, + { + "epoch": 1.166709544635966, + "grad_norm": 0.19723279774188995, + "learning_rate": 9.255441008018329e-06, + "loss": 0.2643, + "step": 566 + }, + { + "epoch": 1.168767687162336, + "grad_norm": 0.20100893080234528, + "learning_rate": 9.23253150057274e-06, + "loss": 0.2601, + "step": 567 + }, + { + "epoch": 1.170825829688706, + "grad_norm": 0.19834032654762268, + "learning_rate": 9.209621993127148e-06, + "loss": 0.2624, + "step": 568 + }, + { + "epoch": 1.172883972215076, + "grad_norm": 0.20677493512630463, + "learning_rate": 9.186712485681557e-06, + "loss": 0.2527, + "step": 569 + }, + { + "epoch": 1.1749421147414458, + "grad_norm": 0.20895297825336456, + "learning_rate": 9.163802978235968e-06, + "loss": 0.2519, + "step": 570 + }, + { + "epoch": 1.1770002572678158, + "grad_norm": 0.19748030602931976, + "learning_rate": 9.140893470790379e-06, + "loss": 0.2567, + "step": 571 + }, + { + "epoch": 1.1790583997941857, + "grad_norm": 0.20713521540164948, + "learning_rate": 9.117983963344789e-06, + "loss": 0.2771, + "step": 572 + }, + { + "epoch": 1.1811165423205556, + "grad_norm": 0.2146754264831543, + "learning_rate": 9.095074455899198e-06, + "loss": 0.2537, + "step": 573 + }, + { + "epoch": 1.1831746848469256, + "grad_norm": 0.20723004639148712, + "learning_rate": 9.072164948453609e-06, + "loss": 0.253, + "step": 574 + }, + { + "epoch": 1.1852328273732957, + "grad_norm": 0.2072172611951828, + "learning_rate": 9.04925544100802e-06, + "loss": 0.2545, + "step": 575 + }, + { + "epoch": 1.1872909698996654, + "grad_norm": 0.20537281036376953, + "learning_rate": 9.02634593356243e-06, + "loss": 0.2517, + "step": 576 + }, + { + "epoch": 1.1893491124260356, + "grad_norm": 0.21034401655197144, + "learning_rate": 9.003436426116839e-06, + "loss": 0.2506, + "step": 577 + }, + { + "epoch": 1.1914072549524055, + "grad_norm": 0.21373845636844635, + "learning_rate": 8.98052691867125e-06, + "loss": 0.2544, + "step": 578 + }, + { + "epoch": 1.1934653974787754, + "grad_norm": 0.22282572090625763, + "learning_rate": 8.95761741122566e-06, + "loss": 0.2607, + "step": 579 + }, + { + "epoch": 1.1955235400051454, + "grad_norm": 0.20421402156352997, + "learning_rate": 8.93470790378007e-06, + "loss": 0.2636, + "step": 580 + }, + { + "epoch": 1.1975816825315153, + "grad_norm": 0.2095903605222702, + "learning_rate": 8.91179839633448e-06, + "loss": 0.2627, + "step": 581 + }, + { + "epoch": 1.1996398250578852, + "grad_norm": 0.2215132862329483, + "learning_rate": 8.888888888888888e-06, + "loss": 0.2651, + "step": 582 + }, + { + "epoch": 1.2016979675842552, + "grad_norm": 0.22536343336105347, + "learning_rate": 8.865979381443299e-06, + "loss": 0.2548, + "step": 583 + }, + { + "epoch": 1.203756110110625, + "grad_norm": 0.19969668984413147, + "learning_rate": 8.84306987399771e-06, + "loss": 0.2646, + "step": 584 + }, + { + "epoch": 1.205814252636995, + "grad_norm": 0.225993350148201, + "learning_rate": 8.82016036655212e-06, + "loss": 0.2607, + "step": 585 + }, + { + "epoch": 1.207872395163365, + "grad_norm": 0.19197311997413635, + "learning_rate": 8.797250859106529e-06, + "loss": 0.2519, + "step": 586 + }, + { + "epoch": 1.209930537689735, + "grad_norm": 0.1974429190158844, + "learning_rate": 8.77434135166094e-06, + "loss": 0.2512, + "step": 587 + }, + { + "epoch": 1.211988680216105, + "grad_norm": 0.19816122949123383, + "learning_rate": 8.75143184421535e-06, + "loss": 0.2582, + "step": 588 + }, + { + "epoch": 1.214046822742475, + "grad_norm": 0.20259711146354675, + "learning_rate": 8.72852233676976e-06, + "loss": 0.2561, + "step": 589 + }, + { + "epoch": 1.216104965268845, + "grad_norm": 0.23857274651527405, + "learning_rate": 8.70561282932417e-06, + "loss": 0.2574, + "step": 590 + }, + { + "epoch": 1.2181631077952149, + "grad_norm": 0.2108597606420517, + "learning_rate": 8.68270332187858e-06, + "loss": 0.2546, + "step": 591 + }, + { + "epoch": 1.2202212503215848, + "grad_norm": 0.20933857560157776, + "learning_rate": 8.65979381443299e-06, + "loss": 0.2527, + "step": 592 + }, + { + "epoch": 1.2222793928479547, + "grad_norm": 0.19276075065135956, + "learning_rate": 8.636884306987401e-06, + "loss": 0.26, + "step": 593 + }, + { + "epoch": 1.2243375353743247, + "grad_norm": 0.2111658900976181, + "learning_rate": 8.61397479954181e-06, + "loss": 0.267, + "step": 594 + }, + { + "epoch": 1.2263956779006946, + "grad_norm": 0.20039953291416168, + "learning_rate": 8.591065292096221e-06, + "loss": 0.2454, + "step": 595 + }, + { + "epoch": 1.2284538204270645, + "grad_norm": 0.212934210896492, + "learning_rate": 8.56815578465063e-06, + "loss": 0.2674, + "step": 596 + }, + { + "epoch": 1.2305119629534345, + "grad_norm": 0.2036072462797165, + "learning_rate": 8.54524627720504e-06, + "loss": 0.2613, + "step": 597 + }, + { + "epoch": 1.2325701054798044, + "grad_norm": 0.20735019445419312, + "learning_rate": 8.522336769759451e-06, + "loss": 0.2648, + "step": 598 + }, + { + "epoch": 1.2346282480061745, + "grad_norm": 0.2097824215888977, + "learning_rate": 8.49942726231386e-06, + "loss": 0.2535, + "step": 599 + }, + { + "epoch": 1.2366863905325443, + "grad_norm": 0.19988034665584564, + "learning_rate": 8.47651775486827e-06, + "loss": 0.2507, + "step": 600 + }, + { + "epoch": 1.2366863905325443, + "eval_loss": 0.28046268224716187, + "eval_runtime": 2441.2385, + "eval_samples_per_second": 3.184, + "eval_steps_per_second": 0.796, + "step": 600 + } + ], + "logging_steps": 1, + "max_steps": 970, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.9815711414309765e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-600/training_args.bin b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..5999c7ee9dd10ee9076d748e4757533e635fa832 --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ee55a11f5a306eb7c39b536fdfe2459bc279e468da50f6adda478c4deffcb812 +size 5688 diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-700/README.md b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-700/README.md new file mode 100644 index 0000000000000000000000000000000000000000..fb0d77d70fdc5c829c8889cb85828736b7eb9714 --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-700/README.md @@ -0,0 +1,202 @@ +--- +base_model: unsloth/codegemma-7b-it +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.15.0 \ No newline at end of file diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-700/adapter_config.json b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-700/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e841602c6a59fc7b085ac647af4d4c312445d261 --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-700/adapter_config.json @@ -0,0 +1,39 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "unsloth/codegemma-7b-it", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "o_proj", + "up_proj", + "q_proj", + "down_proj", + "gate_proj", + "k_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-700/adapter_model.safetensors b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-700/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..be70a87fa00ec5058838dfa989ad17fcfa13a276 --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-700/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db2e504d0bd0428054b4a12e3c8ba35f5bd2c5e10f9e73d460a820f85eb1755e +size 800116456 diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-700/optimizer.pt b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-700/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..efa6080deca0972578a14740e1625f165fc533b1 --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-700/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d3e8a54f00486fed86e1b5824dc84f5ed8ec6205cee89cd31117cafd37a47198 +size 406743860 diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-700/rng_state.pth b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-700/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..0656c37445b705388e501905cdb7321a426e6845 --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-700/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:448594757d6de5d1e88c4f56dc0f856ce5b411ae6f4a9716f1743b750f7387d1 +size 14244 diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-700/scheduler.pt b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-700/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..bba8136b16abd7cfb9f7b0a4737ca4bdd1ff7022 --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-700/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:485b79e445bb43945018360c12d8a67fa5bf3d9bdd235b5d703182382aa6fa61 +size 1064 diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-700/special_tokens_map.json b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-700/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..f6119589e367b2de0fc8cbd2f1217667532e3174 --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-700/special_tokens_map.json @@ -0,0 +1,30 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-700/tokenizer.json b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-700/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..45a5e23f54141c5f4f97a8d58f3ffadc28e287ba --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-700/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d964a2c8346d40f95791533eae48730d5f163c2e65fd16333560fd3e661df318 +size 34362915 diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-700/tokenizer.model b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-700/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..71a98ce40269d847e58957e1e070d9ae8eb184af --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-700/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:583f2ebd2a1936009b7da991ea255504db68c7a9713a78673d1335a87098966c +size 4241023 diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-700/tokenizer_config.json b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-700/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9b9b1b4acdd4afcedae39d1cf6f0bc7ef7d9910f --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-700/tokenizer_config.json @@ -0,0 +1,2011 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "<|fim_prefix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "<|fim_middle|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "<|fim_suffix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "<|file_separator|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "139": { + "content": "▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "140": { + "content": "▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "141": { + "content": "▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "142": { + "content": "▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "143": { + "content": "▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "144": { + "content": "▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "145": { + "content": "▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "146": { + "content": "▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "147": { + "content": "▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "148": { + "content": "▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "149": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "150": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "152": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "153": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "154": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "155": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "156": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "157": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "158": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "159": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "160": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "161": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "162": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "163": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "164": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "165": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "166": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "167": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "168": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255968": { + "content": "[toxicity=0]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255969": { + "content": "\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255970": { + "content": "\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255971": { + "content": "\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255972": { + "content": "\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255973": { + "content": "\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255974": { + "content": "\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255975": { + "content": "\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255976": { + "content": "\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255977": { + "content": "\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255978": { + "content": "\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255979": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255980": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255981": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255982": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255983": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255984": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255985": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255986": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255987": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255988": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255989": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255990": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255991": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255992": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255993": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255994": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255995": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255996": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255997": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255998": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255999": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "bos_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "extra_special_tokens": {}, + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "padding_side": "left", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-700/trainer_state.json b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-700/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..d09e192f0d31f5cede1602395e39878fbb2d3f65 --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-700/trainer_state.json @@ -0,0 +1,4990 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.4425006431695395, + "eval_steps": 100, + "global_step": 700, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0020581425263699513, + "grad_norm": 11.994463920593262, + "learning_rate": 2.061855670103093e-07, + "loss": 2.91, + "step": 1 + }, + { + "epoch": 0.004116285052739903, + "grad_norm": 11.769092559814453, + "learning_rate": 4.123711340206186e-07, + "loss": 2.8686, + "step": 2 + }, + { + "epoch": 0.0061744275791098535, + "grad_norm": 13.05551815032959, + "learning_rate": 6.185567010309279e-07, + "loss": 3.0286, + "step": 3 + }, + { + "epoch": 0.008232570105479805, + "grad_norm": 12.334521293640137, + "learning_rate": 8.247422680412372e-07, + "loss": 2.904, + "step": 4 + }, + { + "epoch": 0.010290712631849755, + "grad_norm": 12.075353622436523, + "learning_rate": 1.0309278350515464e-06, + "loss": 2.8991, + "step": 5 + }, + { + "epoch": 0.012348855158219707, + "grad_norm": 11.86032485961914, + "learning_rate": 1.2371134020618557e-06, + "loss": 3.0007, + "step": 6 + }, + { + "epoch": 0.014406997684589657, + "grad_norm": 10.10457992553711, + "learning_rate": 1.4432989690721649e-06, + "loss": 2.8493, + "step": 7 + }, + { + "epoch": 0.01646514021095961, + "grad_norm": 8.56408405303955, + "learning_rate": 1.6494845360824744e-06, + "loss": 2.9573, + "step": 8 + }, + { + "epoch": 0.01852328273732956, + "grad_norm": 6.307392120361328, + "learning_rate": 1.8556701030927837e-06, + "loss": 2.9507, + "step": 9 + }, + { + "epoch": 0.02058142526369951, + "grad_norm": 4.276430130004883, + "learning_rate": 2.061855670103093e-06, + "loss": 2.8988, + "step": 10 + }, + { + "epoch": 0.022639567790069464, + "grad_norm": 2.5912015438079834, + "learning_rate": 2.268041237113402e-06, + "loss": 2.9926, + "step": 11 + }, + { + "epoch": 0.024697710316439414, + "grad_norm": 2.018446207046509, + "learning_rate": 2.4742268041237115e-06, + "loss": 2.9874, + "step": 12 + }, + { + "epoch": 0.026755852842809364, + "grad_norm": 1.8558588027954102, + "learning_rate": 2.680412371134021e-06, + "loss": 2.8608, + "step": 13 + }, + { + "epoch": 0.028813995369179314, + "grad_norm": 1.9658265113830566, + "learning_rate": 2.8865979381443297e-06, + "loss": 2.8596, + "step": 14 + }, + { + "epoch": 0.030872137895549268, + "grad_norm": 1.872044563293457, + "learning_rate": 3.0927835051546395e-06, + "loss": 2.8836, + "step": 15 + }, + { + "epoch": 0.03293028042191922, + "grad_norm": 1.8884096145629883, + "learning_rate": 3.298969072164949e-06, + "loss": 2.9383, + "step": 16 + }, + { + "epoch": 0.03498842294828917, + "grad_norm": 1.8795744180679321, + "learning_rate": 3.5051546391752577e-06, + "loss": 2.883, + "step": 17 + }, + { + "epoch": 0.03704656547465912, + "grad_norm": 1.783678412437439, + "learning_rate": 3.7113402061855674e-06, + "loss": 2.8019, + "step": 18 + }, + { + "epoch": 0.039104708001029075, + "grad_norm": 1.820617914199829, + "learning_rate": 3.917525773195877e-06, + "loss": 2.8813, + "step": 19 + }, + { + "epoch": 0.04116285052739902, + "grad_norm": 1.8188731670379639, + "learning_rate": 4.123711340206186e-06, + "loss": 2.8401, + "step": 20 + }, + { + "epoch": 0.043220993053768975, + "grad_norm": 1.7305251359939575, + "learning_rate": 4.329896907216495e-06, + "loss": 2.7478, + "step": 21 + }, + { + "epoch": 0.04527913558013893, + "grad_norm": 1.7014551162719727, + "learning_rate": 4.536082474226804e-06, + "loss": 2.7356, + "step": 22 + }, + { + "epoch": 0.047337278106508875, + "grad_norm": 1.677381157875061, + "learning_rate": 4.742268041237113e-06, + "loss": 2.7593, + "step": 23 + }, + { + "epoch": 0.04939542063287883, + "grad_norm": 1.628554344177246, + "learning_rate": 4.948453608247423e-06, + "loss": 2.7689, + "step": 24 + }, + { + "epoch": 0.051453563159248775, + "grad_norm": 1.4968128204345703, + "learning_rate": 5.154639175257732e-06, + "loss": 2.6613, + "step": 25 + }, + { + "epoch": 0.05351170568561873, + "grad_norm": 1.4734832048416138, + "learning_rate": 5.360824742268042e-06, + "loss": 2.7095, + "step": 26 + }, + { + "epoch": 0.05556984821198868, + "grad_norm": 1.3745571374893188, + "learning_rate": 5.567010309278351e-06, + "loss": 2.655, + "step": 27 + }, + { + "epoch": 0.05762799073835863, + "grad_norm": 1.3381729125976562, + "learning_rate": 5.7731958762886594e-06, + "loss": 2.55, + "step": 28 + }, + { + "epoch": 0.05968613326472858, + "grad_norm": 1.3388073444366455, + "learning_rate": 5.979381443298969e-06, + "loss": 2.5219, + "step": 29 + }, + { + "epoch": 0.061744275791098535, + "grad_norm": 1.317008376121521, + "learning_rate": 6.185567010309279e-06, + "loss": 2.4491, + "step": 30 + }, + { + "epoch": 0.06380241831746848, + "grad_norm": 1.3210794925689697, + "learning_rate": 6.391752577319588e-06, + "loss": 2.4358, + "step": 31 + }, + { + "epoch": 0.06586056084383844, + "grad_norm": 1.182519555091858, + "learning_rate": 6.597938144329898e-06, + "loss": 2.4514, + "step": 32 + }, + { + "epoch": 0.06791870337020839, + "grad_norm": 1.2238099575042725, + "learning_rate": 6.804123711340207e-06, + "loss": 2.442, + "step": 33 + }, + { + "epoch": 0.06997684589657834, + "grad_norm": 1.1793314218521118, + "learning_rate": 7.010309278350515e-06, + "loss": 2.3864, + "step": 34 + }, + { + "epoch": 0.0720349884229483, + "grad_norm": 1.1983020305633545, + "learning_rate": 7.216494845360825e-06, + "loss": 2.3796, + "step": 35 + }, + { + "epoch": 0.07409313094931824, + "grad_norm": 1.2189652919769287, + "learning_rate": 7.422680412371135e-06, + "loss": 2.4152, + "step": 36 + }, + { + "epoch": 0.07615127347568819, + "grad_norm": 1.14923095703125, + "learning_rate": 7.628865979381444e-06, + "loss": 2.3298, + "step": 37 + }, + { + "epoch": 0.07820941600205815, + "grad_norm": 1.147013545036316, + "learning_rate": 7.835051546391754e-06, + "loss": 2.2488, + "step": 38 + }, + { + "epoch": 0.0802675585284281, + "grad_norm": 1.133981466293335, + "learning_rate": 8.041237113402063e-06, + "loss": 2.1825, + "step": 39 + }, + { + "epoch": 0.08232570105479804, + "grad_norm": 1.1686867475509644, + "learning_rate": 8.247422680412371e-06, + "loss": 2.2282, + "step": 40 + }, + { + "epoch": 0.084383843581168, + "grad_norm": 1.131690502166748, + "learning_rate": 8.453608247422681e-06, + "loss": 2.0962, + "step": 41 + }, + { + "epoch": 0.08644198610753795, + "grad_norm": 1.1626195907592773, + "learning_rate": 8.65979381443299e-06, + "loss": 2.1161, + "step": 42 + }, + { + "epoch": 0.0885001286339079, + "grad_norm": 1.1508581638336182, + "learning_rate": 8.865979381443299e-06, + "loss": 1.9856, + "step": 43 + }, + { + "epoch": 0.09055827116027786, + "grad_norm": 1.2286733388900757, + "learning_rate": 9.072164948453609e-06, + "loss": 2.076, + "step": 44 + }, + { + "epoch": 0.0926164136866478, + "grad_norm": 1.82068932056427, + "learning_rate": 9.278350515463918e-06, + "loss": 1.9995, + "step": 45 + }, + { + "epoch": 0.09467455621301775, + "grad_norm": 2.079101324081421, + "learning_rate": 9.484536082474226e-06, + "loss": 1.9601, + "step": 46 + }, + { + "epoch": 0.0967326987393877, + "grad_norm": 1.1209226846694946, + "learning_rate": 9.690721649484536e-06, + "loss": 1.9346, + "step": 47 + }, + { + "epoch": 0.09879084126575766, + "grad_norm": 1.0579711198806763, + "learning_rate": 9.896907216494846e-06, + "loss": 1.8764, + "step": 48 + }, + { + "epoch": 0.1008489837921276, + "grad_norm": 1.0434011220932007, + "learning_rate": 1.0103092783505156e-05, + "loss": 1.8483, + "step": 49 + }, + { + "epoch": 0.10290712631849755, + "grad_norm": 1.0089991092681885, + "learning_rate": 1.0309278350515464e-05, + "loss": 1.8018, + "step": 50 + }, + { + "epoch": 0.10496526884486751, + "grad_norm": 1.0117324590682983, + "learning_rate": 1.0515463917525775e-05, + "loss": 1.8003, + "step": 51 + }, + { + "epoch": 0.10702341137123746, + "grad_norm": 1.0006697177886963, + "learning_rate": 1.0721649484536083e-05, + "loss": 1.7482, + "step": 52 + }, + { + "epoch": 0.1090815538976074, + "grad_norm": 2.1164329051971436, + "learning_rate": 1.0927835051546391e-05, + "loss": 1.7363, + "step": 53 + }, + { + "epoch": 0.11113969642397736, + "grad_norm": 0.9573502540588379, + "learning_rate": 1.1134020618556703e-05, + "loss": 1.661, + "step": 54 + }, + { + "epoch": 0.11319783895034731, + "grad_norm": 1.0059764385223389, + "learning_rate": 1.134020618556701e-05, + "loss": 1.6979, + "step": 55 + }, + { + "epoch": 0.11525598147671726, + "grad_norm": 0.9719656109809875, + "learning_rate": 1.1546391752577319e-05, + "loss": 1.6318, + "step": 56 + }, + { + "epoch": 0.11731412400308722, + "grad_norm": 1.0024539232254028, + "learning_rate": 1.175257731958763e-05, + "loss": 1.6283, + "step": 57 + }, + { + "epoch": 0.11937226652945716, + "grad_norm": 0.9772456288337708, + "learning_rate": 1.1958762886597938e-05, + "loss": 1.5611, + "step": 58 + }, + { + "epoch": 0.12143040905582711, + "grad_norm": 0.9947625994682312, + "learning_rate": 1.2164948453608248e-05, + "loss": 1.6073, + "step": 59 + }, + { + "epoch": 0.12348855158219707, + "grad_norm": 2.112889051437378, + "learning_rate": 1.2371134020618558e-05, + "loss": 1.6208, + "step": 60 + }, + { + "epoch": 0.12554669410856703, + "grad_norm": 1.0515345335006714, + "learning_rate": 1.2577319587628866e-05, + "loss": 1.569, + "step": 61 + }, + { + "epoch": 0.12760483663493696, + "grad_norm": 1.0782145261764526, + "learning_rate": 1.2783505154639176e-05, + "loss": 1.5097, + "step": 62 + }, + { + "epoch": 0.12966297916130692, + "grad_norm": 1.154104232788086, + "learning_rate": 1.2989690721649485e-05, + "loss": 1.5472, + "step": 63 + }, + { + "epoch": 0.13172112168767688, + "grad_norm": 1.1614656448364258, + "learning_rate": 1.3195876288659795e-05, + "loss": 1.4833, + "step": 64 + }, + { + "epoch": 0.13377926421404682, + "grad_norm": 1.1720911264419556, + "learning_rate": 1.3402061855670103e-05, + "loss": 1.4644, + "step": 65 + }, + { + "epoch": 0.13583740674041678, + "grad_norm": 1.8903896808624268, + "learning_rate": 1.3608247422680415e-05, + "loss": 1.4286, + "step": 66 + }, + { + "epoch": 0.13789554926678674, + "grad_norm": 1.2675013542175293, + "learning_rate": 1.3814432989690723e-05, + "loss": 1.416, + "step": 67 + }, + { + "epoch": 0.13995369179315667, + "grad_norm": 1.266434907913208, + "learning_rate": 1.402061855670103e-05, + "loss": 1.3171, + "step": 68 + }, + { + "epoch": 0.14201183431952663, + "grad_norm": 1.3408889770507812, + "learning_rate": 1.4226804123711342e-05, + "loss": 1.3396, + "step": 69 + }, + { + "epoch": 0.1440699768458966, + "grad_norm": 1.3862446546554565, + "learning_rate": 1.443298969072165e-05, + "loss": 1.2642, + "step": 70 + }, + { + "epoch": 0.14612811937226652, + "grad_norm": 2.110553026199341, + "learning_rate": 1.4639175257731958e-05, + "loss": 1.2593, + "step": 71 + }, + { + "epoch": 0.14818626189863648, + "grad_norm": 1.7017499208450317, + "learning_rate": 1.484536082474227e-05, + "loss": 1.24, + "step": 72 + }, + { + "epoch": 0.15024440442500644, + "grad_norm": 1.9851700067520142, + "learning_rate": 1.5051546391752578e-05, + "loss": 1.2313, + "step": 73 + }, + { + "epoch": 0.15230254695137638, + "grad_norm": 2.009608030319214, + "learning_rate": 1.5257731958762888e-05, + "loss": 1.1281, + "step": 74 + }, + { + "epoch": 0.15436068947774634, + "grad_norm": 2.7587485313415527, + "learning_rate": 1.5463917525773197e-05, + "loss": 1.1248, + "step": 75 + }, + { + "epoch": 0.1564188320041163, + "grad_norm": 2.780954599380493, + "learning_rate": 1.5670103092783507e-05, + "loss": 1.0797, + "step": 76 + }, + { + "epoch": 0.15847697453048623, + "grad_norm": 3.1470866203308105, + "learning_rate": 1.5876288659793813e-05, + "loss": 1.0064, + "step": 77 + }, + { + "epoch": 0.1605351170568562, + "grad_norm": 4.653595447540283, + "learning_rate": 1.6082474226804127e-05, + "loss": 0.9219, + "step": 78 + }, + { + "epoch": 0.16259325958322615, + "grad_norm": 4.157363414764404, + "learning_rate": 1.6288659793814433e-05, + "loss": 0.8709, + "step": 79 + }, + { + "epoch": 0.16465140210959608, + "grad_norm": 4.5814924240112305, + "learning_rate": 1.6494845360824743e-05, + "loss": 0.7693, + "step": 80 + }, + { + "epoch": 0.16670954463596604, + "grad_norm": 5.096139907836914, + "learning_rate": 1.6701030927835052e-05, + "loss": 0.6868, + "step": 81 + }, + { + "epoch": 0.168767687162336, + "grad_norm": 4.858880519866943, + "learning_rate": 1.6907216494845362e-05, + "loss": 0.5971, + "step": 82 + }, + { + "epoch": 0.17082582968870594, + "grad_norm": 4.42564582824707, + "learning_rate": 1.7113402061855672e-05, + "loss": 0.4719, + "step": 83 + }, + { + "epoch": 0.1728839722150759, + "grad_norm": 7.720851421356201, + "learning_rate": 1.731958762886598e-05, + "loss": 0.3943, + "step": 84 + }, + { + "epoch": 0.17494211474144586, + "grad_norm": 0.41923192143440247, + "learning_rate": 1.752577319587629e-05, + "loss": 0.3635, + "step": 85 + }, + { + "epoch": 0.1770002572678158, + "grad_norm": 0.2771846354007721, + "learning_rate": 1.7731958762886598e-05, + "loss": 0.3597, + "step": 86 + }, + { + "epoch": 0.17905839979418575, + "grad_norm": 0.24761857092380524, + "learning_rate": 1.793814432989691e-05, + "loss": 0.3735, + "step": 87 + }, + { + "epoch": 0.1811165423205557, + "grad_norm": 0.23277048766613007, + "learning_rate": 1.8144329896907217e-05, + "loss": 0.3643, + "step": 88 + }, + { + "epoch": 0.18317468484692565, + "grad_norm": 0.22931228578090668, + "learning_rate": 1.8350515463917527e-05, + "loss": 0.3519, + "step": 89 + }, + { + "epoch": 0.1852328273732956, + "grad_norm": 0.20750615000724792, + "learning_rate": 1.8556701030927837e-05, + "loss": 0.3431, + "step": 90 + }, + { + "epoch": 0.18729096989966554, + "grad_norm": 0.2080322951078415, + "learning_rate": 1.8762886597938147e-05, + "loss": 0.3632, + "step": 91 + }, + { + "epoch": 0.1893491124260355, + "grad_norm": 0.20186181366443634, + "learning_rate": 1.8969072164948453e-05, + "loss": 0.3492, + "step": 92 + }, + { + "epoch": 0.19140725495240546, + "grad_norm": 0.19172786176204681, + "learning_rate": 1.9175257731958766e-05, + "loss": 0.3552, + "step": 93 + }, + { + "epoch": 0.1934653974787754, + "grad_norm": 0.1747850626707077, + "learning_rate": 1.9381443298969072e-05, + "loss": 0.3355, + "step": 94 + }, + { + "epoch": 0.19552354000514535, + "grad_norm": 0.196411594748497, + "learning_rate": 1.9587628865979382e-05, + "loss": 0.3271, + "step": 95 + }, + { + "epoch": 0.1975816825315153, + "grad_norm": 0.20063228905200958, + "learning_rate": 1.9793814432989692e-05, + "loss": 0.3351, + "step": 96 + }, + { + "epoch": 0.19963982505788525, + "grad_norm": 0.19240939617156982, + "learning_rate": 2e-05, + "loss": 0.3266, + "step": 97 + }, + { + "epoch": 0.2016979675842552, + "grad_norm": 0.18206572532653809, + "learning_rate": 1.997709049255441e-05, + "loss": 0.3393, + "step": 98 + }, + { + "epoch": 0.20375611011062517, + "grad_norm": 0.20384562015533447, + "learning_rate": 1.9954180985108823e-05, + "loss": 0.3395, + "step": 99 + }, + { + "epoch": 0.2058142526369951, + "grad_norm": 0.19944581389427185, + "learning_rate": 1.9931271477663232e-05, + "loss": 0.3268, + "step": 100 + }, + { + "epoch": 0.2058142526369951, + "eval_loss": 0.3456890285015106, + "eval_runtime": 2114.0178, + "eval_samples_per_second": 3.677, + "eval_steps_per_second": 0.92, + "step": 100 + }, + { + "epoch": 0.20787239516336506, + "grad_norm": 0.17743557691574097, + "learning_rate": 1.990836197021764e-05, + "loss": 0.3439, + "step": 101 + }, + { + "epoch": 0.20993053768973502, + "grad_norm": 0.18746449053287506, + "learning_rate": 1.9885452462772053e-05, + "loss": 0.326, + "step": 102 + }, + { + "epoch": 0.21198868021610495, + "grad_norm": 0.18555815517902374, + "learning_rate": 1.9862542955326462e-05, + "loss": 0.3337, + "step": 103 + }, + { + "epoch": 0.2140468227424749, + "grad_norm": 0.16591575741767883, + "learning_rate": 1.9839633447880874e-05, + "loss": 0.3121, + "step": 104 + }, + { + "epoch": 0.21610496526884487, + "grad_norm": 0.1621987372636795, + "learning_rate": 1.9816723940435283e-05, + "loss": 0.3287, + "step": 105 + }, + { + "epoch": 0.2181631077952148, + "grad_norm": 0.1614532470703125, + "learning_rate": 1.9793814432989692e-05, + "loss": 0.3306, + "step": 106 + }, + { + "epoch": 0.22022125032158477, + "grad_norm": 0.17993387579917908, + "learning_rate": 1.9770904925544104e-05, + "loss": 0.3341, + "step": 107 + }, + { + "epoch": 0.22227939284795473, + "grad_norm": 0.1550011783838272, + "learning_rate": 1.9747995418098513e-05, + "loss": 0.3197, + "step": 108 + }, + { + "epoch": 0.22433753537432466, + "grad_norm": 0.18471524119377136, + "learning_rate": 1.9725085910652922e-05, + "loss": 0.3285, + "step": 109 + }, + { + "epoch": 0.22639567790069462, + "grad_norm": 0.15604373812675476, + "learning_rate": 1.9702176403207334e-05, + "loss": 0.3298, + "step": 110 + }, + { + "epoch": 0.22845382042706458, + "grad_norm": 0.1682298630475998, + "learning_rate": 1.9679266895761743e-05, + "loss": 0.3343, + "step": 111 + }, + { + "epoch": 0.2305119629534345, + "grad_norm": 0.14933635294437408, + "learning_rate": 1.9656357388316152e-05, + "loss": 0.3134, + "step": 112 + }, + { + "epoch": 0.23257010547980447, + "grad_norm": 0.14892347157001495, + "learning_rate": 1.963344788087056e-05, + "loss": 0.3154, + "step": 113 + }, + { + "epoch": 0.23462824800617443, + "grad_norm": 0.1577889323234558, + "learning_rate": 1.9610538373424973e-05, + "loss": 0.3122, + "step": 114 + }, + { + "epoch": 0.23668639053254437, + "grad_norm": 0.16482344269752502, + "learning_rate": 1.9587628865979382e-05, + "loss": 0.3193, + "step": 115 + }, + { + "epoch": 0.23874453305891433, + "grad_norm": 0.15328913927078247, + "learning_rate": 1.956471935853379e-05, + "loss": 0.3217, + "step": 116 + }, + { + "epoch": 0.2408026755852843, + "grad_norm": 0.16140656173229218, + "learning_rate": 1.9541809851088203e-05, + "loss": 0.318, + "step": 117 + }, + { + "epoch": 0.24286081811165422, + "grad_norm": 0.15448373556137085, + "learning_rate": 1.9518900343642612e-05, + "loss": 0.3205, + "step": 118 + }, + { + "epoch": 0.24491896063802418, + "grad_norm": 0.14716887474060059, + "learning_rate": 1.9495990836197025e-05, + "loss": 0.3164, + "step": 119 + }, + { + "epoch": 0.24697710316439414, + "grad_norm": 0.16582027077674866, + "learning_rate": 1.9473081328751433e-05, + "loss": 0.3191, + "step": 120 + }, + { + "epoch": 0.24903524569076407, + "grad_norm": 0.15213699638843536, + "learning_rate": 1.9450171821305842e-05, + "loss": 0.304, + "step": 121 + }, + { + "epoch": 0.25109338821713406, + "grad_norm": 0.1659238487482071, + "learning_rate": 1.9427262313860255e-05, + "loss": 0.3184, + "step": 122 + }, + { + "epoch": 0.253151530743504, + "grad_norm": 0.15596656501293182, + "learning_rate": 1.9404352806414663e-05, + "loss": 0.3092, + "step": 123 + }, + { + "epoch": 0.2552096732698739, + "grad_norm": 0.15868476033210754, + "learning_rate": 1.9381443298969072e-05, + "loss": 0.3163, + "step": 124 + }, + { + "epoch": 0.2572678157962439, + "grad_norm": 0.15386095643043518, + "learning_rate": 1.9358533791523485e-05, + "loss": 0.3049, + "step": 125 + }, + { + "epoch": 0.25932595832261385, + "grad_norm": 0.15179213881492615, + "learning_rate": 1.9335624284077894e-05, + "loss": 0.3131, + "step": 126 + }, + { + "epoch": 0.2613841008489838, + "grad_norm": 0.1595134735107422, + "learning_rate": 1.9312714776632306e-05, + "loss": 0.3069, + "step": 127 + }, + { + "epoch": 0.26344224337535377, + "grad_norm": 0.16989803314208984, + "learning_rate": 1.9289805269186715e-05, + "loss": 0.3052, + "step": 128 + }, + { + "epoch": 0.2655003859017237, + "grad_norm": 0.14803892374038696, + "learning_rate": 1.9266895761741124e-05, + "loss": 0.3065, + "step": 129 + }, + { + "epoch": 0.26755852842809363, + "grad_norm": 0.16676583886146545, + "learning_rate": 1.9243986254295536e-05, + "loss": 0.2962, + "step": 130 + }, + { + "epoch": 0.2696166709544636, + "grad_norm": 0.15694552659988403, + "learning_rate": 1.9221076746849945e-05, + "loss": 0.3096, + "step": 131 + }, + { + "epoch": 0.27167481348083355, + "grad_norm": 0.17696696519851685, + "learning_rate": 1.9198167239404354e-05, + "loss": 0.3145, + "step": 132 + }, + { + "epoch": 0.2737329560072035, + "grad_norm": 0.17204038798809052, + "learning_rate": 1.9175257731958766e-05, + "loss": 0.3248, + "step": 133 + }, + { + "epoch": 0.2757910985335735, + "grad_norm": 0.15630359947681427, + "learning_rate": 1.9152348224513175e-05, + "loss": 0.3117, + "step": 134 + }, + { + "epoch": 0.2778492410599434, + "grad_norm": 0.15757997334003448, + "learning_rate": 1.9129438717067584e-05, + "loss": 0.3145, + "step": 135 + }, + { + "epoch": 0.27990738358631334, + "grad_norm": 0.16273653507232666, + "learning_rate": 1.9106529209621996e-05, + "loss": 0.3159, + "step": 136 + }, + { + "epoch": 0.28196552611268333, + "grad_norm": 0.16213104128837585, + "learning_rate": 1.9083619702176405e-05, + "loss": 0.2949, + "step": 137 + }, + { + "epoch": 0.28402366863905326, + "grad_norm": 0.15377865731716156, + "learning_rate": 1.9060710194730814e-05, + "loss": 0.306, + "step": 138 + }, + { + "epoch": 0.2860818111654232, + "grad_norm": 0.1545962244272232, + "learning_rate": 1.9037800687285223e-05, + "loss": 0.2966, + "step": 139 + }, + { + "epoch": 0.2881399536917932, + "grad_norm": 0.15516617894172668, + "learning_rate": 1.9014891179839635e-05, + "loss": 0.3122, + "step": 140 + }, + { + "epoch": 0.2901980962181631, + "grad_norm": 0.14734458923339844, + "learning_rate": 1.8991981672394044e-05, + "loss": 0.3118, + "step": 141 + }, + { + "epoch": 0.29225623874453305, + "grad_norm": 0.1644304096698761, + "learning_rate": 1.8969072164948453e-05, + "loss": 0.3027, + "step": 142 + }, + { + "epoch": 0.29431438127090304, + "grad_norm": 0.14632569253444672, + "learning_rate": 1.8946162657502865e-05, + "loss": 0.3023, + "step": 143 + }, + { + "epoch": 0.29637252379727297, + "grad_norm": 0.1573137789964676, + "learning_rate": 1.8923253150057274e-05, + "loss": 0.3102, + "step": 144 + }, + { + "epoch": 0.2984306663236429, + "grad_norm": 0.16423144936561584, + "learning_rate": 1.8900343642611686e-05, + "loss": 0.3033, + "step": 145 + }, + { + "epoch": 0.3004888088500129, + "grad_norm": 0.15420907735824585, + "learning_rate": 1.8877434135166095e-05, + "loss": 0.3089, + "step": 146 + }, + { + "epoch": 0.3025469513763828, + "grad_norm": 0.1579178273677826, + "learning_rate": 1.8854524627720504e-05, + "loss": 0.3071, + "step": 147 + }, + { + "epoch": 0.30460509390275275, + "grad_norm": 0.15866397321224213, + "learning_rate": 1.8831615120274916e-05, + "loss": 0.3083, + "step": 148 + }, + { + "epoch": 0.30666323642912274, + "grad_norm": 0.16651487350463867, + "learning_rate": 1.8808705612829325e-05, + "loss": 0.3099, + "step": 149 + }, + { + "epoch": 0.3087213789554927, + "grad_norm": 0.16281908750534058, + "learning_rate": 1.8785796105383734e-05, + "loss": 0.3034, + "step": 150 + }, + { + "epoch": 0.3107795214818626, + "grad_norm": 0.17449837923049927, + "learning_rate": 1.8762886597938147e-05, + "loss": 0.3054, + "step": 151 + }, + { + "epoch": 0.3128376640082326, + "grad_norm": 0.15403546392917633, + "learning_rate": 1.8739977090492555e-05, + "loss": 0.297, + "step": 152 + }, + { + "epoch": 0.31489580653460253, + "grad_norm": 0.1472466140985489, + "learning_rate": 1.8717067583046968e-05, + "loss": 0.2973, + "step": 153 + }, + { + "epoch": 0.31695394906097246, + "grad_norm": 0.16027937829494476, + "learning_rate": 1.8694158075601377e-05, + "loss": 0.3054, + "step": 154 + }, + { + "epoch": 0.31901209158734245, + "grad_norm": 0.17086225748062134, + "learning_rate": 1.8671248568155786e-05, + "loss": 0.307, + "step": 155 + }, + { + "epoch": 0.3210702341137124, + "grad_norm": 0.15930697321891785, + "learning_rate": 1.8648339060710198e-05, + "loss": 0.293, + "step": 156 + }, + { + "epoch": 0.3231283766400823, + "grad_norm": 0.17086376249790192, + "learning_rate": 1.8625429553264607e-05, + "loss": 0.293, + "step": 157 + }, + { + "epoch": 0.3251865191664523, + "grad_norm": 0.15970875322818756, + "learning_rate": 1.8602520045819016e-05, + "loss": 0.3083, + "step": 158 + }, + { + "epoch": 0.32724466169282224, + "grad_norm": 0.16355909407138824, + "learning_rate": 1.8579610538373428e-05, + "loss": 0.3139, + "step": 159 + }, + { + "epoch": 0.32930280421919217, + "grad_norm": 0.15183711051940918, + "learning_rate": 1.8556701030927837e-05, + "loss": 0.2953, + "step": 160 + }, + { + "epoch": 0.33136094674556216, + "grad_norm": 0.15123715996742249, + "learning_rate": 1.853379152348225e-05, + "loss": 0.3025, + "step": 161 + }, + { + "epoch": 0.3334190892719321, + "grad_norm": 0.1576143503189087, + "learning_rate": 1.8510882016036658e-05, + "loss": 0.2904, + "step": 162 + }, + { + "epoch": 0.335477231798302, + "grad_norm": 0.1457504779100418, + "learning_rate": 1.8487972508591067e-05, + "loss": 0.2909, + "step": 163 + }, + { + "epoch": 0.337535374324672, + "grad_norm": 0.1557442992925644, + "learning_rate": 1.846506300114548e-05, + "loss": 0.3027, + "step": 164 + }, + { + "epoch": 0.33959351685104194, + "grad_norm": 0.15662318468093872, + "learning_rate": 1.8442153493699888e-05, + "loss": 0.311, + "step": 165 + }, + { + "epoch": 0.3416516593774119, + "grad_norm": 0.16177058219909668, + "learning_rate": 1.8419243986254297e-05, + "loss": 0.2944, + "step": 166 + }, + { + "epoch": 0.34370980190378186, + "grad_norm": 0.16406729817390442, + "learning_rate": 1.8396334478808706e-05, + "loss": 0.2927, + "step": 167 + }, + { + "epoch": 0.3457679444301518, + "grad_norm": 0.16642791032791138, + "learning_rate": 1.8373424971363115e-05, + "loss": 0.3063, + "step": 168 + }, + { + "epoch": 0.34782608695652173, + "grad_norm": 0.1650693714618683, + "learning_rate": 1.8350515463917527e-05, + "loss": 0.2957, + "step": 169 + }, + { + "epoch": 0.3498842294828917, + "grad_norm": 0.15349675714969635, + "learning_rate": 1.8327605956471936e-05, + "loss": 0.297, + "step": 170 + }, + { + "epoch": 0.35194237200926165, + "grad_norm": 0.17770209908485413, + "learning_rate": 1.8304696449026348e-05, + "loss": 0.3011, + "step": 171 + }, + { + "epoch": 0.3540005145356316, + "grad_norm": 0.1647631675004959, + "learning_rate": 1.8281786941580757e-05, + "loss": 0.2962, + "step": 172 + }, + { + "epoch": 0.35605865706200157, + "grad_norm": 0.1603834480047226, + "learning_rate": 1.8258877434135166e-05, + "loss": 0.2937, + "step": 173 + }, + { + "epoch": 0.3581167995883715, + "grad_norm": 0.16780880093574524, + "learning_rate": 1.8235967926689578e-05, + "loss": 0.2997, + "step": 174 + }, + { + "epoch": 0.36017494211474144, + "grad_norm": 0.15976767241954803, + "learning_rate": 1.8213058419243987e-05, + "loss": 0.3043, + "step": 175 + }, + { + "epoch": 0.3622330846411114, + "grad_norm": 0.16236485540866852, + "learning_rate": 1.8190148911798396e-05, + "loss": 0.3069, + "step": 176 + }, + { + "epoch": 0.36429122716748136, + "grad_norm": 0.16391968727111816, + "learning_rate": 1.816723940435281e-05, + "loss": 0.2923, + "step": 177 + }, + { + "epoch": 0.3663493696938513, + "grad_norm": 0.15806889533996582, + "learning_rate": 1.8144329896907217e-05, + "loss": 0.2872, + "step": 178 + }, + { + "epoch": 0.3684075122202212, + "grad_norm": 0.1627352088689804, + "learning_rate": 1.812142038946163e-05, + "loss": 0.3032, + "step": 179 + }, + { + "epoch": 0.3704656547465912, + "grad_norm": 0.15103371441364288, + "learning_rate": 1.809851088201604e-05, + "loss": 0.2847, + "step": 180 + }, + { + "epoch": 0.37252379727296114, + "grad_norm": 0.15178488194942474, + "learning_rate": 1.8075601374570447e-05, + "loss": 0.3017, + "step": 181 + }, + { + "epoch": 0.3745819397993311, + "grad_norm": 0.15493899583816528, + "learning_rate": 1.805269186712486e-05, + "loss": 0.2901, + "step": 182 + }, + { + "epoch": 0.37664008232570106, + "grad_norm": 0.15990686416625977, + "learning_rate": 1.802978235967927e-05, + "loss": 0.2861, + "step": 183 + }, + { + "epoch": 0.378698224852071, + "grad_norm": 0.15824148058891296, + "learning_rate": 1.8006872852233677e-05, + "loss": 0.2885, + "step": 184 + }, + { + "epoch": 0.38075636737844093, + "grad_norm": 0.15690775215625763, + "learning_rate": 1.798396334478809e-05, + "loss": 0.2814, + "step": 185 + }, + { + "epoch": 0.3828145099048109, + "grad_norm": 0.15833796560764313, + "learning_rate": 1.79610538373425e-05, + "loss": 0.2847, + "step": 186 + }, + { + "epoch": 0.38487265243118085, + "grad_norm": 0.16560044884681702, + "learning_rate": 1.793814432989691e-05, + "loss": 0.3061, + "step": 187 + }, + { + "epoch": 0.3869307949575508, + "grad_norm": 0.16240179538726807, + "learning_rate": 1.791523482245132e-05, + "loss": 0.2943, + "step": 188 + }, + { + "epoch": 0.38898893748392077, + "grad_norm": 0.15825721621513367, + "learning_rate": 1.789232531500573e-05, + "loss": 0.2934, + "step": 189 + }, + { + "epoch": 0.3910470800102907, + "grad_norm": 0.16665388643741608, + "learning_rate": 1.786941580756014e-05, + "loss": 0.291, + "step": 190 + }, + { + "epoch": 0.39310522253666064, + "grad_norm": 0.16581200063228607, + "learning_rate": 1.784650630011455e-05, + "loss": 0.2849, + "step": 191 + }, + { + "epoch": 0.3951633650630306, + "grad_norm": 0.1604345291852951, + "learning_rate": 1.782359679266896e-05, + "loss": 0.3, + "step": 192 + }, + { + "epoch": 0.39722150758940056, + "grad_norm": 0.16107915341854095, + "learning_rate": 1.7800687285223368e-05, + "loss": 0.2847, + "step": 193 + }, + { + "epoch": 0.3992796501157705, + "grad_norm": 0.1571730375289917, + "learning_rate": 1.7777777777777777e-05, + "loss": 0.2863, + "step": 194 + }, + { + "epoch": 0.4013377926421405, + "grad_norm": 0.1656399518251419, + "learning_rate": 1.775486827033219e-05, + "loss": 0.2878, + "step": 195 + }, + { + "epoch": 0.4033959351685104, + "grad_norm": 0.16738460958003998, + "learning_rate": 1.7731958762886598e-05, + "loss": 0.286, + "step": 196 + }, + { + "epoch": 0.40545407769488034, + "grad_norm": 0.16704292595386505, + "learning_rate": 1.770904925544101e-05, + "loss": 0.2919, + "step": 197 + }, + { + "epoch": 0.40751222022125033, + "grad_norm": 0.16215579211711884, + "learning_rate": 1.768613974799542e-05, + "loss": 0.2874, + "step": 198 + }, + { + "epoch": 0.40957036274762026, + "grad_norm": 0.15573479235172272, + "learning_rate": 1.7663230240549828e-05, + "loss": 0.2904, + "step": 199 + }, + { + "epoch": 0.4116285052739902, + "grad_norm": 0.1707623153924942, + "learning_rate": 1.764032073310424e-05, + "loss": 0.289, + "step": 200 + }, + { + "epoch": 0.4116285052739902, + "eval_loss": 0.3214050829410553, + "eval_runtime": 2449.7742, + "eval_samples_per_second": 3.173, + "eval_steps_per_second": 0.794, + "step": 200 + }, + { + "epoch": 0.4136866478003602, + "grad_norm": 0.1699172556400299, + "learning_rate": 1.761741122565865e-05, + "loss": 0.2852, + "step": 201 + }, + { + "epoch": 0.4157447903267301, + "grad_norm": 0.19150058925151825, + "learning_rate": 1.7594501718213058e-05, + "loss": 0.29, + "step": 202 + }, + { + "epoch": 0.41780293285310005, + "grad_norm": 0.15794627368450165, + "learning_rate": 1.757159221076747e-05, + "loss": 0.2746, + "step": 203 + }, + { + "epoch": 0.41986107537947004, + "grad_norm": 0.17305190861225128, + "learning_rate": 1.754868270332188e-05, + "loss": 0.3003, + "step": 204 + }, + { + "epoch": 0.42191921790583997, + "grad_norm": 0.16257523000240326, + "learning_rate": 1.752577319587629e-05, + "loss": 0.2789, + "step": 205 + }, + { + "epoch": 0.4239773604322099, + "grad_norm": 0.17273619771003723, + "learning_rate": 1.75028636884307e-05, + "loss": 0.2917, + "step": 206 + }, + { + "epoch": 0.4260355029585799, + "grad_norm": 0.17502790689468384, + "learning_rate": 1.747995418098511e-05, + "loss": 0.2992, + "step": 207 + }, + { + "epoch": 0.4280936454849498, + "grad_norm": 0.16464050114154816, + "learning_rate": 1.745704467353952e-05, + "loss": 0.2873, + "step": 208 + }, + { + "epoch": 0.43015178801131976, + "grad_norm": 0.1681668758392334, + "learning_rate": 1.743413516609393e-05, + "loss": 0.2991, + "step": 209 + }, + { + "epoch": 0.43220993053768975, + "grad_norm": 0.16957956552505493, + "learning_rate": 1.741122565864834e-05, + "loss": 0.2868, + "step": 210 + }, + { + "epoch": 0.4342680730640597, + "grad_norm": 0.15875883400440216, + "learning_rate": 1.738831615120275e-05, + "loss": 0.2946, + "step": 211 + }, + { + "epoch": 0.4363262155904296, + "grad_norm": 0.18127889931201935, + "learning_rate": 1.736540664375716e-05, + "loss": 0.2835, + "step": 212 + }, + { + "epoch": 0.4383843581167996, + "grad_norm": 0.17822811007499695, + "learning_rate": 1.7342497136311573e-05, + "loss": 0.2944, + "step": 213 + }, + { + "epoch": 0.44044250064316953, + "grad_norm": 0.17555806040763855, + "learning_rate": 1.731958762886598e-05, + "loss": 0.3001, + "step": 214 + }, + { + "epoch": 0.44250064316953946, + "grad_norm": 0.18709121644496918, + "learning_rate": 1.729667812142039e-05, + "loss": 0.282, + "step": 215 + }, + { + "epoch": 0.44455878569590945, + "grad_norm": 0.16322475671768188, + "learning_rate": 1.7273768613974803e-05, + "loss": 0.2883, + "step": 216 + }, + { + "epoch": 0.4466169282222794, + "grad_norm": 0.1677054911851883, + "learning_rate": 1.7250859106529212e-05, + "loss": 0.28, + "step": 217 + }, + { + "epoch": 0.4486750707486493, + "grad_norm": 0.15764063596725464, + "learning_rate": 1.722794959908362e-05, + "loss": 0.2768, + "step": 218 + }, + { + "epoch": 0.4507332132750193, + "grad_norm": 0.16166841983795166, + "learning_rate": 1.7205040091638033e-05, + "loss": 0.2868, + "step": 219 + }, + { + "epoch": 0.45279135580138924, + "grad_norm": 0.1799350380897522, + "learning_rate": 1.7182130584192442e-05, + "loss": 0.2891, + "step": 220 + }, + { + "epoch": 0.45484949832775917, + "grad_norm": 0.18119174242019653, + "learning_rate": 1.715922107674685e-05, + "loss": 0.2841, + "step": 221 + }, + { + "epoch": 0.45690764085412916, + "grad_norm": 0.17725548148155212, + "learning_rate": 1.713631156930126e-05, + "loss": 0.3038, + "step": 222 + }, + { + "epoch": 0.4589657833804991, + "grad_norm": 0.1628233790397644, + "learning_rate": 1.7113402061855672e-05, + "loss": 0.2868, + "step": 223 + }, + { + "epoch": 0.461023925906869, + "grad_norm": 0.1745166927576065, + "learning_rate": 1.709049255441008e-05, + "loss": 0.3033, + "step": 224 + }, + { + "epoch": 0.463082068433239, + "grad_norm": 0.17708267271518707, + "learning_rate": 1.706758304696449e-05, + "loss": 0.2842, + "step": 225 + }, + { + "epoch": 0.46514021095960895, + "grad_norm": 0.1738453358411789, + "learning_rate": 1.7044673539518902e-05, + "loss": 0.3005, + "step": 226 + }, + { + "epoch": 0.4671983534859789, + "grad_norm": 0.1706874966621399, + "learning_rate": 1.702176403207331e-05, + "loss": 0.2924, + "step": 227 + }, + { + "epoch": 0.46925649601234887, + "grad_norm": 0.1697423756122589, + "learning_rate": 1.699885452462772e-05, + "loss": 0.2783, + "step": 228 + }, + { + "epoch": 0.4713146385387188, + "grad_norm": 0.1783403754234314, + "learning_rate": 1.6975945017182132e-05, + "loss": 0.2924, + "step": 229 + }, + { + "epoch": 0.47337278106508873, + "grad_norm": 0.17431536316871643, + "learning_rate": 1.695303550973654e-05, + "loss": 0.2792, + "step": 230 + }, + { + "epoch": 0.4754309235914587, + "grad_norm": 0.164026141166687, + "learning_rate": 1.6930126002290953e-05, + "loss": 0.2825, + "step": 231 + }, + { + "epoch": 0.47748906611782865, + "grad_norm": 0.16449657082557678, + "learning_rate": 1.6907216494845362e-05, + "loss": 0.2831, + "step": 232 + }, + { + "epoch": 0.4795472086441986, + "grad_norm": 0.1812741607427597, + "learning_rate": 1.688430698739977e-05, + "loss": 0.2849, + "step": 233 + }, + { + "epoch": 0.4816053511705686, + "grad_norm": 0.18431834876537323, + "learning_rate": 1.6861397479954183e-05, + "loss": 0.2802, + "step": 234 + }, + { + "epoch": 0.4836634936969385, + "grad_norm": 0.18349015712738037, + "learning_rate": 1.6838487972508592e-05, + "loss": 0.2804, + "step": 235 + }, + { + "epoch": 0.48572163622330844, + "grad_norm": 0.1769968420267105, + "learning_rate": 1.6815578465063e-05, + "loss": 0.2777, + "step": 236 + }, + { + "epoch": 0.4877797787496784, + "grad_norm": 0.17207500338554382, + "learning_rate": 1.6792668957617413e-05, + "loss": 0.2883, + "step": 237 + }, + { + "epoch": 0.48983792127604836, + "grad_norm": 0.1729692667722702, + "learning_rate": 1.6769759450171822e-05, + "loss": 0.2784, + "step": 238 + }, + { + "epoch": 0.4918960638024183, + "grad_norm": 0.17234881222248077, + "learning_rate": 1.6746849942726235e-05, + "loss": 0.2816, + "step": 239 + }, + { + "epoch": 0.4939542063287883, + "grad_norm": 0.17132551968097687, + "learning_rate": 1.6723940435280644e-05, + "loss": 0.2812, + "step": 240 + }, + { + "epoch": 0.4960123488551582, + "grad_norm": 0.1752254068851471, + "learning_rate": 1.6701030927835052e-05, + "loss": 0.2799, + "step": 241 + }, + { + "epoch": 0.49807049138152815, + "grad_norm": 0.1768665313720703, + "learning_rate": 1.6678121420389465e-05, + "loss": 0.2966, + "step": 242 + }, + { + "epoch": 0.5001286339078981, + "grad_norm": 0.18139514327049255, + "learning_rate": 1.6655211912943874e-05, + "loss": 0.2816, + "step": 243 + }, + { + "epoch": 0.5021867764342681, + "grad_norm": 0.17312943935394287, + "learning_rate": 1.6632302405498283e-05, + "loss": 0.2845, + "step": 244 + }, + { + "epoch": 0.5042449189606381, + "grad_norm": 0.17966389656066895, + "learning_rate": 1.6609392898052695e-05, + "loss": 0.2864, + "step": 245 + }, + { + "epoch": 0.506303061487008, + "grad_norm": 0.16653811931610107, + "learning_rate": 1.6586483390607104e-05, + "loss": 0.2759, + "step": 246 + }, + { + "epoch": 0.5083612040133779, + "grad_norm": 0.1634613424539566, + "learning_rate": 1.6563573883161516e-05, + "loss": 0.2728, + "step": 247 + }, + { + "epoch": 0.5104193465397479, + "grad_norm": 0.17358507215976715, + "learning_rate": 1.654066437571592e-05, + "loss": 0.2706, + "step": 248 + }, + { + "epoch": 0.5124774890661178, + "grad_norm": 0.17524316906929016, + "learning_rate": 1.6517754868270334e-05, + "loss": 0.2805, + "step": 249 + }, + { + "epoch": 0.5145356315924878, + "grad_norm": 0.18134094774723053, + "learning_rate": 1.6494845360824743e-05, + "loss": 0.2909, + "step": 250 + }, + { + "epoch": 0.5165937741188578, + "grad_norm": 0.17795510590076447, + "learning_rate": 1.647193585337915e-05, + "loss": 0.2889, + "step": 251 + }, + { + "epoch": 0.5186519166452277, + "grad_norm": 0.16782547533512115, + "learning_rate": 1.6449026345933564e-05, + "loss": 0.2842, + "step": 252 + }, + { + "epoch": 0.5207100591715976, + "grad_norm": 0.17360062897205353, + "learning_rate": 1.6426116838487973e-05, + "loss": 0.2763, + "step": 253 + }, + { + "epoch": 0.5227682016979676, + "grad_norm": 0.17241406440734863, + "learning_rate": 1.6403207331042385e-05, + "loss": 0.2753, + "step": 254 + }, + { + "epoch": 0.5248263442243375, + "grad_norm": 0.1709229201078415, + "learning_rate": 1.6380297823596794e-05, + "loss": 0.2732, + "step": 255 + }, + { + "epoch": 0.5268844867507075, + "grad_norm": 0.1807374209165573, + "learning_rate": 1.6357388316151203e-05, + "loss": 0.2856, + "step": 256 + }, + { + "epoch": 0.5289426292770775, + "grad_norm": 0.1749904304742813, + "learning_rate": 1.6334478808705615e-05, + "loss": 0.285, + "step": 257 + }, + { + "epoch": 0.5310007718034474, + "grad_norm": 0.16673170030117035, + "learning_rate": 1.6311569301260024e-05, + "loss": 0.2825, + "step": 258 + }, + { + "epoch": 0.5330589143298173, + "grad_norm": 0.17239685356616974, + "learning_rate": 1.6288659793814433e-05, + "loss": 0.2845, + "step": 259 + }, + { + "epoch": 0.5351170568561873, + "grad_norm": 0.1831504851579666, + "learning_rate": 1.6265750286368845e-05, + "loss": 0.2859, + "step": 260 + }, + { + "epoch": 0.5371751993825572, + "grad_norm": 0.18507827818393707, + "learning_rate": 1.6242840778923254e-05, + "loss": 0.293, + "step": 261 + }, + { + "epoch": 0.5392333419089272, + "grad_norm": 0.16738134622573853, + "learning_rate": 1.6219931271477663e-05, + "loss": 0.2853, + "step": 262 + }, + { + "epoch": 0.5412914844352972, + "grad_norm": 0.1701226830482483, + "learning_rate": 1.6197021764032075e-05, + "loss": 0.2763, + "step": 263 + }, + { + "epoch": 0.5433496269616671, + "grad_norm": 0.18195705115795135, + "learning_rate": 1.6174112256586484e-05, + "loss": 0.2797, + "step": 264 + }, + { + "epoch": 0.545407769488037, + "grad_norm": 0.1832309514284134, + "learning_rate": 1.6151202749140896e-05, + "loss": 0.2885, + "step": 265 + }, + { + "epoch": 0.547465912014407, + "grad_norm": 0.1773810088634491, + "learning_rate": 1.6128293241695305e-05, + "loss": 0.2682, + "step": 266 + }, + { + "epoch": 0.5495240545407769, + "grad_norm": 0.16989603638648987, + "learning_rate": 1.6105383734249714e-05, + "loss": 0.2821, + "step": 267 + }, + { + "epoch": 0.551582197067147, + "grad_norm": 0.17835170030593872, + "learning_rate": 1.6082474226804127e-05, + "loss": 0.2774, + "step": 268 + }, + { + "epoch": 0.5536403395935169, + "grad_norm": 0.1777082234621048, + "learning_rate": 1.6059564719358535e-05, + "loss": 0.2726, + "step": 269 + }, + { + "epoch": 0.5556984821198868, + "grad_norm": 0.18766450881958008, + "learning_rate": 1.6036655211912944e-05, + "loss": 0.2879, + "step": 270 + }, + { + "epoch": 0.5577566246462567, + "grad_norm": 0.1868186593055725, + "learning_rate": 1.6013745704467357e-05, + "loss": 0.2808, + "step": 271 + }, + { + "epoch": 0.5598147671726267, + "grad_norm": 0.16695882380008698, + "learning_rate": 1.5990836197021766e-05, + "loss": 0.2668, + "step": 272 + }, + { + "epoch": 0.5618729096989966, + "grad_norm": 0.17224495112895966, + "learning_rate": 1.5967926689576178e-05, + "loss": 0.2682, + "step": 273 + }, + { + "epoch": 0.5639310522253667, + "grad_norm": 0.20116423070430756, + "learning_rate": 1.5945017182130587e-05, + "loss": 0.276, + "step": 274 + }, + { + "epoch": 0.5659891947517366, + "grad_norm": 0.19478343427181244, + "learning_rate": 1.5922107674684996e-05, + "loss": 0.2854, + "step": 275 + }, + { + "epoch": 0.5680473372781065, + "grad_norm": 0.20242950320243835, + "learning_rate": 1.5899198167239405e-05, + "loss": 0.2854, + "step": 276 + }, + { + "epoch": 0.5701054798044765, + "grad_norm": 0.19146093726158142, + "learning_rate": 1.5876288659793813e-05, + "loss": 0.2817, + "step": 277 + }, + { + "epoch": 0.5721636223308464, + "grad_norm": 0.1804896742105484, + "learning_rate": 1.5853379152348226e-05, + "loss": 0.2714, + "step": 278 + }, + { + "epoch": 0.5742217648572163, + "grad_norm": 0.19315646588802338, + "learning_rate": 1.5830469644902635e-05, + "loss": 0.2703, + "step": 279 + }, + { + "epoch": 0.5762799073835864, + "grad_norm": 0.1910266876220703, + "learning_rate": 1.5807560137457047e-05, + "loss": 0.2728, + "step": 280 + }, + { + "epoch": 0.5783380499099563, + "grad_norm": 0.20330773293972015, + "learning_rate": 1.5784650630011456e-05, + "loss": 0.2717, + "step": 281 + }, + { + "epoch": 0.5803961924363262, + "grad_norm": 0.19080683588981628, + "learning_rate": 1.5761741122565865e-05, + "loss": 0.2679, + "step": 282 + }, + { + "epoch": 0.5824543349626962, + "grad_norm": 0.18052135407924652, + "learning_rate": 1.5738831615120277e-05, + "loss": 0.2815, + "step": 283 + }, + { + "epoch": 0.5845124774890661, + "grad_norm": 0.1998361051082611, + "learning_rate": 1.5715922107674686e-05, + "loss": 0.2888, + "step": 284 + }, + { + "epoch": 0.586570620015436, + "grad_norm": 0.1978764683008194, + "learning_rate": 1.5693012600229095e-05, + "loss": 0.2926, + "step": 285 + }, + { + "epoch": 0.5886287625418061, + "grad_norm": 0.17189203202724457, + "learning_rate": 1.5670103092783507e-05, + "loss": 0.2674, + "step": 286 + }, + { + "epoch": 0.590686905068176, + "grad_norm": 0.1937166303396225, + "learning_rate": 1.5647193585337916e-05, + "loss": 0.2838, + "step": 287 + }, + { + "epoch": 0.5927450475945459, + "grad_norm": 0.18978627026081085, + "learning_rate": 1.5624284077892328e-05, + "loss": 0.273, + "step": 288 + }, + { + "epoch": 0.5948031901209159, + "grad_norm": 0.17718705534934998, + "learning_rate": 1.5601374570446737e-05, + "loss": 0.2842, + "step": 289 + }, + { + "epoch": 0.5968613326472858, + "grad_norm": 0.1912536770105362, + "learning_rate": 1.5578465063001146e-05, + "loss": 0.2736, + "step": 290 + }, + { + "epoch": 0.5989194751736557, + "grad_norm": 0.18104907870292664, + "learning_rate": 1.555555555555556e-05, + "loss": 0.274, + "step": 291 + }, + { + "epoch": 0.6009776177000258, + "grad_norm": 0.1620381772518158, + "learning_rate": 1.5532646048109967e-05, + "loss": 0.2663, + "step": 292 + }, + { + "epoch": 0.6030357602263957, + "grad_norm": 0.17973916232585907, + "learning_rate": 1.5509736540664376e-05, + "loss": 0.2791, + "step": 293 + }, + { + "epoch": 0.6050939027527656, + "grad_norm": 0.16821186244487762, + "learning_rate": 1.548682703321879e-05, + "loss": 0.2787, + "step": 294 + }, + { + "epoch": 0.6071520452791356, + "grad_norm": 0.18426693975925446, + "learning_rate": 1.5463917525773197e-05, + "loss": 0.2886, + "step": 295 + }, + { + "epoch": 0.6092101878055055, + "grad_norm": 0.19796033203601837, + "learning_rate": 1.5441008018327606e-05, + "loss": 0.268, + "step": 296 + }, + { + "epoch": 0.6112683303318754, + "grad_norm": 0.1971343755722046, + "learning_rate": 1.541809851088202e-05, + "loss": 0.2761, + "step": 297 + }, + { + "epoch": 0.6133264728582455, + "grad_norm": 0.17458567023277283, + "learning_rate": 1.5395189003436427e-05, + "loss": 0.2831, + "step": 298 + }, + { + "epoch": 0.6153846153846154, + "grad_norm": 0.17610400915145874, + "learning_rate": 1.537227949599084e-05, + "loss": 0.2691, + "step": 299 + }, + { + "epoch": 0.6174427579109854, + "grad_norm": 0.1929042488336563, + "learning_rate": 1.534936998854525e-05, + "loss": 0.2847, + "step": 300 + }, + { + "epoch": 0.6174427579109854, + "eval_loss": 0.2959522604942322, + "eval_runtime": 2428.6339, + "eval_samples_per_second": 3.201, + "eval_steps_per_second": 0.8, + "step": 300 + }, + { + "epoch": 0.6195009004373553, + "grad_norm": 0.19430233538150787, + "learning_rate": 1.5326460481099657e-05, + "loss": 0.279, + "step": 301 + }, + { + "epoch": 0.6215590429637252, + "grad_norm": 0.18542642891407013, + "learning_rate": 1.5303550973654066e-05, + "loss": 0.2695, + "step": 302 + }, + { + "epoch": 0.6236171854900951, + "grad_norm": 0.1850169450044632, + "learning_rate": 1.5280641466208475e-05, + "loss": 0.2847, + "step": 303 + }, + { + "epoch": 0.6256753280164652, + "grad_norm": 0.18449267745018005, + "learning_rate": 1.5257731958762888e-05, + "loss": 0.2804, + "step": 304 + }, + { + "epoch": 0.6277334705428351, + "grad_norm": 0.18608458340168, + "learning_rate": 1.5234822451317296e-05, + "loss": 0.2792, + "step": 305 + }, + { + "epoch": 0.6297916130692051, + "grad_norm": 0.21136076748371124, + "learning_rate": 1.5211912943871707e-05, + "loss": 0.2829, + "step": 306 + }, + { + "epoch": 0.631849755595575, + "grad_norm": 0.19672206044197083, + "learning_rate": 1.5189003436426118e-05, + "loss": 0.2854, + "step": 307 + }, + { + "epoch": 0.6339078981219449, + "grad_norm": 0.1834034025669098, + "learning_rate": 1.5166093928980528e-05, + "loss": 0.2775, + "step": 308 + }, + { + "epoch": 0.6359660406483149, + "grad_norm": 0.18414819240570068, + "learning_rate": 1.5143184421534937e-05, + "loss": 0.2794, + "step": 309 + }, + { + "epoch": 0.6380241831746849, + "grad_norm": 0.1890152245759964, + "learning_rate": 1.5120274914089348e-05, + "loss": 0.2718, + "step": 310 + }, + { + "epoch": 0.6400823257010548, + "grad_norm": 0.18923887610435486, + "learning_rate": 1.5097365406643758e-05, + "loss": 0.2795, + "step": 311 + }, + { + "epoch": 0.6421404682274248, + "grad_norm": 0.20047079026699066, + "learning_rate": 1.5074455899198169e-05, + "loss": 0.2811, + "step": 312 + }, + { + "epoch": 0.6441986107537947, + "grad_norm": 0.1910201609134674, + "learning_rate": 1.5051546391752578e-05, + "loss": 0.2732, + "step": 313 + }, + { + "epoch": 0.6462567532801646, + "grad_norm": 0.2021956443786621, + "learning_rate": 1.5028636884306988e-05, + "loss": 0.2806, + "step": 314 + }, + { + "epoch": 0.6483148958065346, + "grad_norm": 0.18957914412021637, + "learning_rate": 1.5005727376861399e-05, + "loss": 0.2681, + "step": 315 + }, + { + "epoch": 0.6503730383329046, + "grad_norm": 0.19858811795711517, + "learning_rate": 1.498281786941581e-05, + "loss": 0.2805, + "step": 316 + }, + { + "epoch": 0.6524311808592745, + "grad_norm": 0.1731935292482376, + "learning_rate": 1.4959908361970218e-05, + "loss": 0.2646, + "step": 317 + }, + { + "epoch": 0.6544893233856445, + "grad_norm": 0.19619058072566986, + "learning_rate": 1.4936998854524629e-05, + "loss": 0.2965, + "step": 318 + }, + { + "epoch": 0.6565474659120144, + "grad_norm": 0.18745696544647217, + "learning_rate": 1.491408934707904e-05, + "loss": 0.2766, + "step": 319 + }, + { + "epoch": 0.6586056084383843, + "grad_norm": 0.18006449937820435, + "learning_rate": 1.489117983963345e-05, + "loss": 0.2788, + "step": 320 + }, + { + "epoch": 0.6606637509647543, + "grad_norm": 0.17593689262866974, + "learning_rate": 1.486827033218786e-05, + "loss": 0.2813, + "step": 321 + }, + { + "epoch": 0.6627218934911243, + "grad_norm": 0.18695640563964844, + "learning_rate": 1.484536082474227e-05, + "loss": 0.281, + "step": 322 + }, + { + "epoch": 0.6647800360174942, + "grad_norm": 0.17909488081932068, + "learning_rate": 1.482245131729668e-05, + "loss": 0.2814, + "step": 323 + }, + { + "epoch": 0.6668381785438642, + "grad_norm": 0.19074076414108276, + "learning_rate": 1.4799541809851091e-05, + "loss": 0.2721, + "step": 324 + }, + { + "epoch": 0.6688963210702341, + "grad_norm": 0.19175754487514496, + "learning_rate": 1.47766323024055e-05, + "loss": 0.2754, + "step": 325 + }, + { + "epoch": 0.670954463596604, + "grad_norm": 0.18646575510501862, + "learning_rate": 1.475372279495991e-05, + "loss": 0.2678, + "step": 326 + }, + { + "epoch": 0.673012606122974, + "grad_norm": 0.18553243577480316, + "learning_rate": 1.4730813287514321e-05, + "loss": 0.281, + "step": 327 + }, + { + "epoch": 0.675070748649344, + "grad_norm": 0.17120976746082306, + "learning_rate": 1.470790378006873e-05, + "loss": 0.2691, + "step": 328 + }, + { + "epoch": 0.677128891175714, + "grad_norm": 0.19170524179935455, + "learning_rate": 1.4684994272623139e-05, + "loss": 0.2685, + "step": 329 + }, + { + "epoch": 0.6791870337020839, + "grad_norm": 0.1851339191198349, + "learning_rate": 1.466208476517755e-05, + "loss": 0.266, + "step": 330 + }, + { + "epoch": 0.6812451762284538, + "grad_norm": 0.1678062081336975, + "learning_rate": 1.4639175257731958e-05, + "loss": 0.2609, + "step": 331 + }, + { + "epoch": 0.6833033187548238, + "grad_norm": 0.17913252115249634, + "learning_rate": 1.4616265750286369e-05, + "loss": 0.2716, + "step": 332 + }, + { + "epoch": 0.6853614612811937, + "grad_norm": 0.1859239637851715, + "learning_rate": 1.459335624284078e-05, + "loss": 0.2712, + "step": 333 + }, + { + "epoch": 0.6874196038075637, + "grad_norm": 0.18390226364135742, + "learning_rate": 1.457044673539519e-05, + "loss": 0.2827, + "step": 334 + }, + { + "epoch": 0.6894777463339337, + "grad_norm": 0.18520398437976837, + "learning_rate": 1.4547537227949599e-05, + "loss": 0.2721, + "step": 335 + }, + { + "epoch": 0.6915358888603036, + "grad_norm": 0.18416717648506165, + "learning_rate": 1.452462772050401e-05, + "loss": 0.2683, + "step": 336 + }, + { + "epoch": 0.6935940313866735, + "grad_norm": 0.18727894127368927, + "learning_rate": 1.450171821305842e-05, + "loss": 0.2733, + "step": 337 + }, + { + "epoch": 0.6956521739130435, + "grad_norm": 0.18597093224525452, + "learning_rate": 1.447880870561283e-05, + "loss": 0.2708, + "step": 338 + }, + { + "epoch": 0.6977103164394134, + "grad_norm": 0.1786068081855774, + "learning_rate": 1.445589919816724e-05, + "loss": 0.2667, + "step": 339 + }, + { + "epoch": 0.6997684589657834, + "grad_norm": 0.17466600239276886, + "learning_rate": 1.443298969072165e-05, + "loss": 0.2786, + "step": 340 + }, + { + "epoch": 0.7018266014921534, + "grad_norm": 0.185857355594635, + "learning_rate": 1.4410080183276061e-05, + "loss": 0.2759, + "step": 341 + }, + { + "epoch": 0.7038847440185233, + "grad_norm": 0.2004527747631073, + "learning_rate": 1.4387170675830471e-05, + "loss": 0.2847, + "step": 342 + }, + { + "epoch": 0.7059428865448932, + "grad_norm": 0.18774060904979706, + "learning_rate": 1.436426116838488e-05, + "loss": 0.2766, + "step": 343 + }, + { + "epoch": 0.7080010290712632, + "grad_norm": 0.1840328425168991, + "learning_rate": 1.4341351660939291e-05, + "loss": 0.2722, + "step": 344 + }, + { + "epoch": 0.7100591715976331, + "grad_norm": 0.19089624285697937, + "learning_rate": 1.4318442153493702e-05, + "loss": 0.2779, + "step": 345 + }, + { + "epoch": 0.7121173141240031, + "grad_norm": 0.1848018616437912, + "learning_rate": 1.4295532646048112e-05, + "loss": 0.2739, + "step": 346 + }, + { + "epoch": 0.7141754566503731, + "grad_norm": 0.18844038248062134, + "learning_rate": 1.4272623138602521e-05, + "loss": 0.27, + "step": 347 + }, + { + "epoch": 0.716233599176743, + "grad_norm": 0.19289302825927734, + "learning_rate": 1.4249713631156932e-05, + "loss": 0.2743, + "step": 348 + }, + { + "epoch": 0.7182917417031129, + "grad_norm": 0.18738920986652374, + "learning_rate": 1.4226804123711342e-05, + "loss": 0.2657, + "step": 349 + }, + { + "epoch": 0.7203498842294829, + "grad_norm": 0.1925181746482849, + "learning_rate": 1.4203894616265753e-05, + "loss": 0.2637, + "step": 350 + }, + { + "epoch": 0.7224080267558528, + "grad_norm": 0.19114750623703003, + "learning_rate": 1.4180985108820162e-05, + "loss": 0.2758, + "step": 351 + }, + { + "epoch": 0.7244661692822228, + "grad_norm": 0.18310120701789856, + "learning_rate": 1.4158075601374572e-05, + "loss": 0.2777, + "step": 352 + }, + { + "epoch": 0.7265243118085928, + "grad_norm": 0.2045605331659317, + "learning_rate": 1.4135166093928983e-05, + "loss": 0.2653, + "step": 353 + }, + { + "epoch": 0.7285824543349627, + "grad_norm": 0.1856454759836197, + "learning_rate": 1.4112256586483393e-05, + "loss": 0.267, + "step": 354 + }, + { + "epoch": 0.7306405968613326, + "grad_norm": 0.1855366826057434, + "learning_rate": 1.4089347079037802e-05, + "loss": 0.2805, + "step": 355 + }, + { + "epoch": 0.7326987393877026, + "grad_norm": 0.17913414537906647, + "learning_rate": 1.4066437571592213e-05, + "loss": 0.2755, + "step": 356 + }, + { + "epoch": 0.7347568819140725, + "grad_norm": 0.2057684361934662, + "learning_rate": 1.404352806414662e-05, + "loss": 0.2668, + "step": 357 + }, + { + "epoch": 0.7368150244404424, + "grad_norm": 0.190156951546669, + "learning_rate": 1.402061855670103e-05, + "loss": 0.2778, + "step": 358 + }, + { + "epoch": 0.7388731669668125, + "grad_norm": 0.19387219846248627, + "learning_rate": 1.3997709049255441e-05, + "loss": 0.2785, + "step": 359 + }, + { + "epoch": 0.7409313094931824, + "grad_norm": 0.1933836042881012, + "learning_rate": 1.3974799541809852e-05, + "loss": 0.2661, + "step": 360 + }, + { + "epoch": 0.7429894520195524, + "grad_norm": 0.19618812203407288, + "learning_rate": 1.3951890034364261e-05, + "loss": 0.2622, + "step": 361 + }, + { + "epoch": 0.7450475945459223, + "grad_norm": 0.18786942958831787, + "learning_rate": 1.3928980526918671e-05, + "loss": 0.2695, + "step": 362 + }, + { + "epoch": 0.7471057370722922, + "grad_norm": 0.19361330568790436, + "learning_rate": 1.3906071019473082e-05, + "loss": 0.2869, + "step": 363 + }, + { + "epoch": 0.7491638795986622, + "grad_norm": 0.19813291728496552, + "learning_rate": 1.3883161512027493e-05, + "loss": 0.2753, + "step": 364 + }, + { + "epoch": 0.7512220221250322, + "grad_norm": 0.1891734004020691, + "learning_rate": 1.3860252004581902e-05, + "loss": 0.2694, + "step": 365 + }, + { + "epoch": 0.7532801646514021, + "grad_norm": 0.18902742862701416, + "learning_rate": 1.3837342497136312e-05, + "loss": 0.2675, + "step": 366 + }, + { + "epoch": 0.7553383071777721, + "grad_norm": 0.19838480651378632, + "learning_rate": 1.3814432989690723e-05, + "loss": 0.2721, + "step": 367 + }, + { + "epoch": 0.757396449704142, + "grad_norm": 0.20880939066410065, + "learning_rate": 1.3791523482245133e-05, + "loss": 0.2641, + "step": 368 + }, + { + "epoch": 0.7594545922305119, + "grad_norm": 0.20068003237247467, + "learning_rate": 1.3768613974799542e-05, + "loss": 0.2945, + "step": 369 + }, + { + "epoch": 0.7615127347568819, + "grad_norm": 0.19780132174491882, + "learning_rate": 1.3745704467353953e-05, + "loss": 0.2687, + "step": 370 + }, + { + "epoch": 0.7635708772832519, + "grad_norm": 0.19194689393043518, + "learning_rate": 1.3722794959908363e-05, + "loss": 0.2731, + "step": 371 + }, + { + "epoch": 0.7656290198096218, + "grad_norm": 0.19504573941230774, + "learning_rate": 1.3699885452462774e-05, + "loss": 0.2551, + "step": 372 + }, + { + "epoch": 0.7676871623359918, + "grad_norm": 0.18304413557052612, + "learning_rate": 1.3676975945017183e-05, + "loss": 0.2692, + "step": 373 + }, + { + "epoch": 0.7697453048623617, + "grad_norm": 0.2051483392715454, + "learning_rate": 1.3654066437571593e-05, + "loss": 0.2791, + "step": 374 + }, + { + "epoch": 0.7718034473887316, + "grad_norm": 0.18748973309993744, + "learning_rate": 1.3631156930126004e-05, + "loss": 0.2671, + "step": 375 + }, + { + "epoch": 0.7738615899151016, + "grad_norm": 0.19167177379131317, + "learning_rate": 1.3608247422680415e-05, + "loss": 0.2766, + "step": 376 + }, + { + "epoch": 0.7759197324414716, + "grad_norm": 0.17931750416755676, + "learning_rate": 1.3585337915234824e-05, + "loss": 0.2748, + "step": 377 + }, + { + "epoch": 0.7779778749678415, + "grad_norm": 0.19437509775161743, + "learning_rate": 1.3562428407789234e-05, + "loss": 0.2667, + "step": 378 + }, + { + "epoch": 0.7800360174942115, + "grad_norm": 0.19813868403434753, + "learning_rate": 1.3539518900343645e-05, + "loss": 0.2771, + "step": 379 + }, + { + "epoch": 0.7820941600205814, + "grad_norm": 0.19205260276794434, + "learning_rate": 1.3516609392898055e-05, + "loss": 0.2703, + "step": 380 + }, + { + "epoch": 0.7841523025469513, + "grad_norm": 0.19039763510227203, + "learning_rate": 1.3493699885452464e-05, + "loss": 0.264, + "step": 381 + }, + { + "epoch": 0.7862104450733213, + "grad_norm": 0.18269500136375427, + "learning_rate": 1.3470790378006875e-05, + "loss": 0.2653, + "step": 382 + }, + { + "epoch": 0.7882685875996913, + "grad_norm": 0.1922067403793335, + "learning_rate": 1.3447880870561285e-05, + "loss": 0.2754, + "step": 383 + }, + { + "epoch": 0.7903267301260612, + "grad_norm": 0.19615666568279266, + "learning_rate": 1.3424971363115693e-05, + "loss": 0.2811, + "step": 384 + }, + { + "epoch": 0.7923848726524312, + "grad_norm": 0.19037973880767822, + "learning_rate": 1.3402061855670103e-05, + "loss": 0.2673, + "step": 385 + }, + { + "epoch": 0.7944430151788011, + "grad_norm": 0.191124826669693, + "learning_rate": 1.3379152348224514e-05, + "loss": 0.2683, + "step": 386 + }, + { + "epoch": 0.796501157705171, + "grad_norm": 0.18429923057556152, + "learning_rate": 1.3356242840778923e-05, + "loss": 0.2698, + "step": 387 + }, + { + "epoch": 0.798559300231541, + "grad_norm": 0.1839045137166977, + "learning_rate": 1.3333333333333333e-05, + "loss": 0.2895, + "step": 388 + }, + { + "epoch": 0.800617442757911, + "grad_norm": 0.1944131702184677, + "learning_rate": 1.3310423825887744e-05, + "loss": 0.2641, + "step": 389 + }, + { + "epoch": 0.802675585284281, + "grad_norm": 0.20407740771770477, + "learning_rate": 1.3287514318442154e-05, + "loss": 0.2743, + "step": 390 + }, + { + "epoch": 0.8047337278106509, + "grad_norm": 0.1814037561416626, + "learning_rate": 1.3264604810996563e-05, + "loss": 0.2672, + "step": 391 + }, + { + "epoch": 0.8067918703370208, + "grad_norm": 0.1886950582265854, + "learning_rate": 1.3241695303550974e-05, + "loss": 0.2725, + "step": 392 + }, + { + "epoch": 0.8088500128633908, + "grad_norm": 0.19429941475391388, + "learning_rate": 1.3218785796105385e-05, + "loss": 0.2669, + "step": 393 + }, + { + "epoch": 0.8109081553897607, + "grad_norm": 0.19143058359622955, + "learning_rate": 1.3195876288659795e-05, + "loss": 0.2659, + "step": 394 + }, + { + "epoch": 0.8129662979161307, + "grad_norm": 0.2213468849658966, + "learning_rate": 1.3172966781214204e-05, + "loss": 0.2764, + "step": 395 + }, + { + "epoch": 0.8150244404425007, + "grad_norm": 0.2040800005197525, + "learning_rate": 1.3150057273768615e-05, + "loss": 0.2783, + "step": 396 + }, + { + "epoch": 0.8170825829688706, + "grad_norm": 0.1948375254869461, + "learning_rate": 1.3127147766323025e-05, + "loss": 0.2689, + "step": 397 + }, + { + "epoch": 0.8191407254952405, + "grad_norm": 0.1915021538734436, + "learning_rate": 1.3104238258877436e-05, + "loss": 0.2808, + "step": 398 + }, + { + "epoch": 0.8211988680216105, + "grad_norm": 0.19760248064994812, + "learning_rate": 1.3081328751431845e-05, + "loss": 0.2712, + "step": 399 + }, + { + "epoch": 0.8232570105479804, + "grad_norm": 0.2082677185535431, + "learning_rate": 1.3058419243986255e-05, + "loss": 0.2707, + "step": 400 + }, + { + "epoch": 0.8232570105479804, + "eval_loss": 0.28778496384620667, + "eval_runtime": 2427.9096, + "eval_samples_per_second": 3.202, + "eval_steps_per_second": 0.801, + "step": 400 + }, + { + "epoch": 0.8253151530743504, + "grad_norm": 0.19694332778453827, + "learning_rate": 1.3035509736540666e-05, + "loss": 0.2801, + "step": 401 + }, + { + "epoch": 0.8273732956007204, + "grad_norm": 0.19448824226856232, + "learning_rate": 1.3012600229095077e-05, + "loss": 0.2632, + "step": 402 + }, + { + "epoch": 0.8294314381270903, + "grad_norm": 0.18745476007461548, + "learning_rate": 1.2989690721649485e-05, + "loss": 0.2773, + "step": 403 + }, + { + "epoch": 0.8314895806534602, + "grad_norm": 0.19524575769901276, + "learning_rate": 1.2966781214203896e-05, + "loss": 0.2594, + "step": 404 + }, + { + "epoch": 0.8335477231798302, + "grad_norm": 0.19612252712249756, + "learning_rate": 1.2943871706758307e-05, + "loss": 0.271, + "step": 405 + }, + { + "epoch": 0.8356058657062001, + "grad_norm": 0.19964493811130524, + "learning_rate": 1.2920962199312717e-05, + "loss": 0.2615, + "step": 406 + }, + { + "epoch": 0.8376640082325701, + "grad_norm": 0.20115099847316742, + "learning_rate": 1.2898052691867126e-05, + "loss": 0.269, + "step": 407 + }, + { + "epoch": 0.8397221507589401, + "grad_norm": 0.18949687480926514, + "learning_rate": 1.2875143184421537e-05, + "loss": 0.2649, + "step": 408 + }, + { + "epoch": 0.84178029328531, + "grad_norm": 0.1931927353143692, + "learning_rate": 1.2852233676975947e-05, + "loss": 0.2611, + "step": 409 + }, + { + "epoch": 0.8438384358116799, + "grad_norm": 0.18723614513874054, + "learning_rate": 1.2829324169530358e-05, + "loss": 0.2699, + "step": 410 + }, + { + "epoch": 0.8458965783380499, + "grad_norm": 0.19405977427959442, + "learning_rate": 1.2806414662084765e-05, + "loss": 0.2691, + "step": 411 + }, + { + "epoch": 0.8479547208644198, + "grad_norm": 0.2021879404783249, + "learning_rate": 1.2783505154639176e-05, + "loss": 0.267, + "step": 412 + }, + { + "epoch": 0.8500128633907899, + "grad_norm": 0.20015574991703033, + "learning_rate": 1.2760595647193586e-05, + "loss": 0.2632, + "step": 413 + }, + { + "epoch": 0.8520710059171598, + "grad_norm": 0.19090059399604797, + "learning_rate": 1.2737686139747995e-05, + "loss": 0.2743, + "step": 414 + }, + { + "epoch": 0.8541291484435297, + "grad_norm": 0.1906920224428177, + "learning_rate": 1.2714776632302406e-05, + "loss": 0.2723, + "step": 415 + }, + { + "epoch": 0.8561872909698997, + "grad_norm": 0.19348129630088806, + "learning_rate": 1.2691867124856816e-05, + "loss": 0.2656, + "step": 416 + }, + { + "epoch": 0.8582454334962696, + "grad_norm": 0.18771213293075562, + "learning_rate": 1.2668957617411227e-05, + "loss": 0.2617, + "step": 417 + }, + { + "epoch": 0.8603035760226395, + "grad_norm": 0.2135135382413864, + "learning_rate": 1.2646048109965636e-05, + "loss": 0.2773, + "step": 418 + }, + { + "epoch": 0.8623617185490096, + "grad_norm": 0.19689443707466125, + "learning_rate": 1.2623138602520046e-05, + "loss": 0.2623, + "step": 419 + }, + { + "epoch": 0.8644198610753795, + "grad_norm": 0.18752440810203552, + "learning_rate": 1.2600229095074457e-05, + "loss": 0.2599, + "step": 420 + }, + { + "epoch": 0.8664780036017494, + "grad_norm": 0.19264395534992218, + "learning_rate": 1.2577319587628866e-05, + "loss": 0.2707, + "step": 421 + }, + { + "epoch": 0.8685361461281194, + "grad_norm": 0.19980797171592712, + "learning_rate": 1.2554410080183277e-05, + "loss": 0.2616, + "step": 422 + }, + { + "epoch": 0.8705942886544893, + "grad_norm": 0.22940242290496826, + "learning_rate": 1.2531500572737687e-05, + "loss": 0.2712, + "step": 423 + }, + { + "epoch": 0.8726524311808592, + "grad_norm": 0.18825359642505646, + "learning_rate": 1.2508591065292098e-05, + "loss": 0.2779, + "step": 424 + }, + { + "epoch": 0.8747105737072293, + "grad_norm": 0.21553562581539154, + "learning_rate": 1.2485681557846507e-05, + "loss": 0.2677, + "step": 425 + }, + { + "epoch": 0.8767687162335992, + "grad_norm": 0.2025568038225174, + "learning_rate": 1.2462772050400917e-05, + "loss": 0.2659, + "step": 426 + }, + { + "epoch": 0.8788268587599691, + "grad_norm": 0.19179950654506683, + "learning_rate": 1.2439862542955328e-05, + "loss": 0.2762, + "step": 427 + }, + { + "epoch": 0.8808850012863391, + "grad_norm": 0.20982210338115692, + "learning_rate": 1.2416953035509738e-05, + "loss": 0.2648, + "step": 428 + }, + { + "epoch": 0.882943143812709, + "grad_norm": 0.2084280252456665, + "learning_rate": 1.2394043528064147e-05, + "loss": 0.2806, + "step": 429 + }, + { + "epoch": 0.8850012863390789, + "grad_norm": 0.1993308663368225, + "learning_rate": 1.2371134020618558e-05, + "loss": 0.2673, + "step": 430 + }, + { + "epoch": 0.887059428865449, + "grad_norm": 0.1917535811662674, + "learning_rate": 1.2348224513172968e-05, + "loss": 0.2596, + "step": 431 + }, + { + "epoch": 0.8891175713918189, + "grad_norm": 0.18980742990970612, + "learning_rate": 1.2325315005727379e-05, + "loss": 0.2607, + "step": 432 + }, + { + "epoch": 0.8911757139181888, + "grad_norm": 0.21062685549259186, + "learning_rate": 1.2302405498281788e-05, + "loss": 0.2612, + "step": 433 + }, + { + "epoch": 0.8932338564445588, + "grad_norm": 0.20591405034065247, + "learning_rate": 1.2279495990836199e-05, + "loss": 0.2698, + "step": 434 + }, + { + "epoch": 0.8952919989709287, + "grad_norm": 0.2052398920059204, + "learning_rate": 1.2256586483390609e-05, + "loss": 0.2673, + "step": 435 + }, + { + "epoch": 0.8973501414972986, + "grad_norm": 0.19963452219963074, + "learning_rate": 1.223367697594502e-05, + "loss": 0.266, + "step": 436 + }, + { + "epoch": 0.8994082840236687, + "grad_norm": 0.1929163783788681, + "learning_rate": 1.2210767468499429e-05, + "loss": 0.2605, + "step": 437 + }, + { + "epoch": 0.9014664265500386, + "grad_norm": 0.19121681153774261, + "learning_rate": 1.218785796105384e-05, + "loss": 0.2642, + "step": 438 + }, + { + "epoch": 0.9035245690764085, + "grad_norm": 0.18931221961975098, + "learning_rate": 1.2164948453608248e-05, + "loss": 0.2653, + "step": 439 + }, + { + "epoch": 0.9055827116027785, + "grad_norm": 0.21359370648860931, + "learning_rate": 1.2142038946162657e-05, + "loss": 0.264, + "step": 440 + }, + { + "epoch": 0.9076408541291484, + "grad_norm": 0.1874193251132965, + "learning_rate": 1.2119129438717068e-05, + "loss": 0.2664, + "step": 441 + }, + { + "epoch": 0.9096989966555183, + "grad_norm": 0.19697226583957672, + "learning_rate": 1.2096219931271478e-05, + "loss": 0.2651, + "step": 442 + }, + { + "epoch": 0.9117571391818884, + "grad_norm": 0.20930957794189453, + "learning_rate": 1.2073310423825889e-05, + "loss": 0.2724, + "step": 443 + }, + { + "epoch": 0.9138152817082583, + "grad_norm": 0.19588977098464966, + "learning_rate": 1.2050400916380298e-05, + "loss": 0.2648, + "step": 444 + }, + { + "epoch": 0.9158734242346283, + "grad_norm": 0.19452017545700073, + "learning_rate": 1.2027491408934708e-05, + "loss": 0.2808, + "step": 445 + }, + { + "epoch": 0.9179315667609982, + "grad_norm": 0.19226408004760742, + "learning_rate": 1.2004581901489119e-05, + "loss": 0.2627, + "step": 446 + }, + { + "epoch": 0.9199897092873681, + "grad_norm": 0.18108274042606354, + "learning_rate": 1.198167239404353e-05, + "loss": 0.2693, + "step": 447 + }, + { + "epoch": 0.922047851813738, + "grad_norm": 0.19352363049983978, + "learning_rate": 1.1958762886597938e-05, + "loss": 0.2705, + "step": 448 + }, + { + "epoch": 0.9241059943401081, + "grad_norm": 0.18535122275352478, + "learning_rate": 1.1935853379152349e-05, + "loss": 0.2608, + "step": 449 + }, + { + "epoch": 0.926164136866478, + "grad_norm": 0.19209617376327515, + "learning_rate": 1.191294387170676e-05, + "loss": 0.2702, + "step": 450 + }, + { + "epoch": 0.928222279392848, + "grad_norm": 0.1866796910762787, + "learning_rate": 1.189003436426117e-05, + "loss": 0.264, + "step": 451 + }, + { + "epoch": 0.9302804219192179, + "grad_norm": 0.21708665788173676, + "learning_rate": 1.1867124856815579e-05, + "loss": 0.2693, + "step": 452 + }, + { + "epoch": 0.9323385644455878, + "grad_norm": 0.19297796487808228, + "learning_rate": 1.184421534936999e-05, + "loss": 0.2745, + "step": 453 + }, + { + "epoch": 0.9343967069719578, + "grad_norm": 0.19070400297641754, + "learning_rate": 1.18213058419244e-05, + "loss": 0.265, + "step": 454 + }, + { + "epoch": 0.9364548494983278, + "grad_norm": 0.19821566343307495, + "learning_rate": 1.1798396334478809e-05, + "loss": 0.2674, + "step": 455 + }, + { + "epoch": 0.9385129920246977, + "grad_norm": 0.2032192200422287, + "learning_rate": 1.177548682703322e-05, + "loss": 0.276, + "step": 456 + }, + { + "epoch": 0.9405711345510677, + "grad_norm": 0.19127750396728516, + "learning_rate": 1.175257731958763e-05, + "loss": 0.2696, + "step": 457 + }, + { + "epoch": 0.9426292770774376, + "grad_norm": 0.19187286496162415, + "learning_rate": 1.1729667812142041e-05, + "loss": 0.2601, + "step": 458 + }, + { + "epoch": 0.9446874196038075, + "grad_norm": 0.20871371030807495, + "learning_rate": 1.170675830469645e-05, + "loss": 0.2687, + "step": 459 + }, + { + "epoch": 0.9467455621301775, + "grad_norm": 0.19228306412696838, + "learning_rate": 1.168384879725086e-05, + "loss": 0.2633, + "step": 460 + }, + { + "epoch": 0.9488037046565475, + "grad_norm": 0.19025444984436035, + "learning_rate": 1.1660939289805271e-05, + "loss": 0.2721, + "step": 461 + }, + { + "epoch": 0.9508618471829174, + "grad_norm": 0.19476914405822754, + "learning_rate": 1.1638029782359682e-05, + "loss": 0.2662, + "step": 462 + }, + { + "epoch": 0.9529199897092874, + "grad_norm": 0.1991666853427887, + "learning_rate": 1.161512027491409e-05, + "loss": 0.269, + "step": 463 + }, + { + "epoch": 0.9549781322356573, + "grad_norm": 0.19385920464992523, + "learning_rate": 1.1592210767468501e-05, + "loss": 0.2647, + "step": 464 + }, + { + "epoch": 0.9570362747620272, + "grad_norm": 0.1911603957414627, + "learning_rate": 1.1569301260022912e-05, + "loss": 0.2679, + "step": 465 + }, + { + "epoch": 0.9590944172883972, + "grad_norm": 0.20373377203941345, + "learning_rate": 1.1546391752577319e-05, + "loss": 0.2694, + "step": 466 + }, + { + "epoch": 0.9611525598147672, + "grad_norm": 0.20550350844860077, + "learning_rate": 1.152348224513173e-05, + "loss": 0.2677, + "step": 467 + }, + { + "epoch": 0.9632107023411371, + "grad_norm": 0.2049354463815689, + "learning_rate": 1.150057273768614e-05, + "loss": 0.2752, + "step": 468 + }, + { + "epoch": 0.9652688448675071, + "grad_norm": 0.21691595017910004, + "learning_rate": 1.147766323024055e-05, + "loss": 0.2727, + "step": 469 + }, + { + "epoch": 0.967326987393877, + "grad_norm": 0.20727306604385376, + "learning_rate": 1.145475372279496e-05, + "loss": 0.2575, + "step": 470 + }, + { + "epoch": 0.969385129920247, + "grad_norm": 0.19166423380374908, + "learning_rate": 1.143184421534937e-05, + "loss": 0.2716, + "step": 471 + }, + { + "epoch": 0.9714432724466169, + "grad_norm": 0.18833886086940765, + "learning_rate": 1.140893470790378e-05, + "loss": 0.2651, + "step": 472 + }, + { + "epoch": 0.9735014149729869, + "grad_norm": 0.19680088758468628, + "learning_rate": 1.1386025200458191e-05, + "loss": 0.2621, + "step": 473 + }, + { + "epoch": 0.9755595574993569, + "grad_norm": 0.20966476202011108, + "learning_rate": 1.13631156930126e-05, + "loss": 0.2725, + "step": 474 + }, + { + "epoch": 0.9776177000257268, + "grad_norm": 0.1963450163602829, + "learning_rate": 1.134020618556701e-05, + "loss": 0.2569, + "step": 475 + }, + { + "epoch": 0.9796758425520967, + "grad_norm": 0.21289944648742676, + "learning_rate": 1.1317296678121421e-05, + "loss": 0.2622, + "step": 476 + }, + { + "epoch": 0.9817339850784667, + "grad_norm": 0.2103341966867447, + "learning_rate": 1.1294387170675832e-05, + "loss": 0.2803, + "step": 477 + }, + { + "epoch": 0.9837921276048366, + "grad_norm": 0.20202945172786713, + "learning_rate": 1.1271477663230241e-05, + "loss": 0.273, + "step": 478 + }, + { + "epoch": 0.9858502701312066, + "grad_norm": 0.18241006135940552, + "learning_rate": 1.1248568155784651e-05, + "loss": 0.2721, + "step": 479 + }, + { + "epoch": 0.9879084126575766, + "grad_norm": 0.19221259653568268, + "learning_rate": 1.1225658648339062e-05, + "loss": 0.2646, + "step": 480 + }, + { + "epoch": 0.9899665551839465, + "grad_norm": 0.19371837377548218, + "learning_rate": 1.1202749140893473e-05, + "loss": 0.2519, + "step": 481 + }, + { + "epoch": 0.9920246977103164, + "grad_norm": 0.1972094029188156, + "learning_rate": 1.1179839633447882e-05, + "loss": 0.2555, + "step": 482 + }, + { + "epoch": 0.9940828402366864, + "grad_norm": 0.19414126873016357, + "learning_rate": 1.1156930126002292e-05, + "loss": 0.2726, + "step": 483 + }, + { + "epoch": 0.9961409827630563, + "grad_norm": 0.18993492424488068, + "learning_rate": 1.1134020618556703e-05, + "loss": 0.2644, + "step": 484 + }, + { + "epoch": 0.9981991252894263, + "grad_norm": 0.19713927805423737, + "learning_rate": 1.1111111111111113e-05, + "loss": 0.2569, + "step": 485 + }, + { + "epoch": 1.00205814252637, + "grad_norm": 0.3423589766025543, + "learning_rate": 1.1088201603665522e-05, + "loss": 0.5285, + "step": 486 + }, + { + "epoch": 1.0041162850527399, + "grad_norm": 0.1901763528585434, + "learning_rate": 1.1065292096219933e-05, + "loss": 0.2621, + "step": 487 + }, + { + "epoch": 1.0061744275791098, + "grad_norm": 0.20508776605129242, + "learning_rate": 1.1042382588774343e-05, + "loss": 0.2665, + "step": 488 + }, + { + "epoch": 1.0082325701054797, + "grad_norm": 0.20188146829605103, + "learning_rate": 1.1019473081328752e-05, + "loss": 0.2547, + "step": 489 + }, + { + "epoch": 1.0102907126318497, + "grad_norm": 0.20245613157749176, + "learning_rate": 1.0996563573883163e-05, + "loss": 0.2657, + "step": 490 + }, + { + "epoch": 1.0123488551582196, + "grad_norm": 0.19711382687091827, + "learning_rate": 1.0973654066437574e-05, + "loss": 0.2597, + "step": 491 + }, + { + "epoch": 1.0144069976845898, + "grad_norm": 0.21538953483104706, + "learning_rate": 1.0950744558991984e-05, + "loss": 0.2727, + "step": 492 + }, + { + "epoch": 1.0164651402109597, + "grad_norm": 0.20296984910964966, + "learning_rate": 1.0927835051546391e-05, + "loss": 0.2634, + "step": 493 + }, + { + "epoch": 1.0185232827373296, + "grad_norm": 0.20134592056274414, + "learning_rate": 1.0904925544100802e-05, + "loss": 0.2596, + "step": 494 + }, + { + "epoch": 1.0205814252636995, + "grad_norm": 0.200101837515831, + "learning_rate": 1.0882016036655212e-05, + "loss": 0.2575, + "step": 495 + }, + { + "epoch": 1.0226395677900695, + "grad_norm": 0.19144928455352783, + "learning_rate": 1.0859106529209621e-05, + "loss": 0.263, + "step": 496 + }, + { + "epoch": 1.0246977103164394, + "grad_norm": 0.19832482933998108, + "learning_rate": 1.0836197021764032e-05, + "loss": 0.2656, + "step": 497 + }, + { + "epoch": 1.0267558528428093, + "grad_norm": 0.20965202152729034, + "learning_rate": 1.0813287514318443e-05, + "loss": 0.2611, + "step": 498 + }, + { + "epoch": 1.0288139953691793, + "grad_norm": 0.1974337100982666, + "learning_rate": 1.0790378006872853e-05, + "loss": 0.2667, + "step": 499 + }, + { + "epoch": 1.0308721378955492, + "grad_norm": 0.20611713826656342, + "learning_rate": 1.0767468499427262e-05, + "loss": 0.2674, + "step": 500 + }, + { + "epoch": 1.0308721378955492, + "eval_loss": 0.2836935222148895, + "eval_runtime": 2423.44, + "eval_samples_per_second": 3.208, + "eval_steps_per_second": 0.802, + "step": 500 + }, + { + "epoch": 1.0329302804219191, + "grad_norm": 0.202958345413208, + "learning_rate": 1.0744558991981673e-05, + "loss": 0.2684, + "step": 501 + }, + { + "epoch": 1.034988422948289, + "grad_norm": 0.1984429508447647, + "learning_rate": 1.0721649484536083e-05, + "loss": 0.2557, + "step": 502 + }, + { + "epoch": 1.0370465654746592, + "grad_norm": 0.19396482408046722, + "learning_rate": 1.0698739977090494e-05, + "loss": 0.255, + "step": 503 + }, + { + "epoch": 1.0391047080010292, + "grad_norm": 0.19176840782165527, + "learning_rate": 1.0675830469644903e-05, + "loss": 0.2675, + "step": 504 + }, + { + "epoch": 1.041162850527399, + "grad_norm": 0.20167966187000275, + "learning_rate": 1.0652920962199313e-05, + "loss": 0.2669, + "step": 505 + }, + { + "epoch": 1.043220993053769, + "grad_norm": 0.2049783617258072, + "learning_rate": 1.0630011454753724e-05, + "loss": 0.2446, + "step": 506 + }, + { + "epoch": 1.045279135580139, + "grad_norm": 0.19293472170829773, + "learning_rate": 1.0607101947308135e-05, + "loss": 0.256, + "step": 507 + }, + { + "epoch": 1.047337278106509, + "grad_norm": 0.19432370364665985, + "learning_rate": 1.0584192439862543e-05, + "loss": 0.2605, + "step": 508 + }, + { + "epoch": 1.0493954206328788, + "grad_norm": 0.19784876704216003, + "learning_rate": 1.0561282932416954e-05, + "loss": 0.2617, + "step": 509 + }, + { + "epoch": 1.0514535631592488, + "grad_norm": 0.19982090592384338, + "learning_rate": 1.0538373424971365e-05, + "loss": 0.264, + "step": 510 + }, + { + "epoch": 1.0535117056856187, + "grad_norm": 0.2019587755203247, + "learning_rate": 1.0515463917525775e-05, + "loss": 0.2543, + "step": 511 + }, + { + "epoch": 1.0555698482119886, + "grad_norm": 0.19848807156085968, + "learning_rate": 1.0492554410080184e-05, + "loss": 0.2613, + "step": 512 + }, + { + "epoch": 1.0576279907383586, + "grad_norm": 0.20360374450683594, + "learning_rate": 1.0469644902634595e-05, + "loss": 0.2675, + "step": 513 + }, + { + "epoch": 1.0596861332647285, + "grad_norm": 0.19209840893745422, + "learning_rate": 1.0446735395189005e-05, + "loss": 0.2517, + "step": 514 + }, + { + "epoch": 1.0617442757910984, + "grad_norm": 0.19142381846904755, + "learning_rate": 1.0423825887743416e-05, + "loss": 0.2631, + "step": 515 + }, + { + "epoch": 1.0638024183174686, + "grad_norm": 0.20222575962543488, + "learning_rate": 1.0400916380297825e-05, + "loss": 0.2625, + "step": 516 + }, + { + "epoch": 1.0658605608438385, + "grad_norm": 0.1984448879957199, + "learning_rate": 1.0378006872852235e-05, + "loss": 0.2584, + "step": 517 + }, + { + "epoch": 1.0679187033702084, + "grad_norm": 0.1992885023355484, + "learning_rate": 1.0355097365406646e-05, + "loss": 0.2609, + "step": 518 + }, + { + "epoch": 1.0699768458965784, + "grad_norm": 0.20708978176116943, + "learning_rate": 1.0332187857961057e-05, + "loss": 0.2618, + "step": 519 + }, + { + "epoch": 1.0720349884229483, + "grad_norm": 0.22806766629219055, + "learning_rate": 1.0309278350515464e-05, + "loss": 0.2634, + "step": 520 + }, + { + "epoch": 1.0740931309493182, + "grad_norm": 0.2019941806793213, + "learning_rate": 1.0286368843069874e-05, + "loss": 0.2588, + "step": 521 + }, + { + "epoch": 1.0761512734756882, + "grad_norm": 0.19460470974445343, + "learning_rate": 1.0263459335624283e-05, + "loss": 0.2692, + "step": 522 + }, + { + "epoch": 1.078209416002058, + "grad_norm": 0.19483187794685364, + "learning_rate": 1.0240549828178694e-05, + "loss": 0.2474, + "step": 523 + }, + { + "epoch": 1.080267558528428, + "grad_norm": 0.2199576050043106, + "learning_rate": 1.0217640320733104e-05, + "loss": 0.2582, + "step": 524 + }, + { + "epoch": 1.082325701054798, + "grad_norm": 0.20485302805900574, + "learning_rate": 1.0194730813287515e-05, + "loss": 0.2463, + "step": 525 + }, + { + "epoch": 1.084383843581168, + "grad_norm": 0.20773454010486603, + "learning_rate": 1.0171821305841924e-05, + "loss": 0.2501, + "step": 526 + }, + { + "epoch": 1.086441986107538, + "grad_norm": 0.19593262672424316, + "learning_rate": 1.0148911798396335e-05, + "loss": 0.2608, + "step": 527 + }, + { + "epoch": 1.088500128633908, + "grad_norm": 0.20500554144382477, + "learning_rate": 1.0126002290950745e-05, + "loss": 0.2586, + "step": 528 + }, + { + "epoch": 1.090558271160278, + "grad_norm": 0.19919747114181519, + "learning_rate": 1.0103092783505156e-05, + "loss": 0.2724, + "step": 529 + }, + { + "epoch": 1.0926164136866479, + "grad_norm": 0.1953326314687729, + "learning_rate": 1.0080183276059565e-05, + "loss": 0.2456, + "step": 530 + }, + { + "epoch": 1.0946745562130178, + "grad_norm": 0.2155047059059143, + "learning_rate": 1.0057273768613975e-05, + "loss": 0.2644, + "step": 531 + }, + { + "epoch": 1.0967326987393877, + "grad_norm": 0.19747495651245117, + "learning_rate": 1.0034364261168386e-05, + "loss": 0.2539, + "step": 532 + }, + { + "epoch": 1.0987908412657577, + "grad_norm": 0.20261652767658234, + "learning_rate": 1.0011454753722796e-05, + "loss": 0.255, + "step": 533 + }, + { + "epoch": 1.1008489837921276, + "grad_norm": 0.19529719650745392, + "learning_rate": 9.988545246277205e-06, + "loss": 0.2489, + "step": 534 + }, + { + "epoch": 1.1029071263184975, + "grad_norm": 0.20239490270614624, + "learning_rate": 9.965635738831616e-06, + "loss": 0.2664, + "step": 535 + }, + { + "epoch": 1.1049652688448675, + "grad_norm": 0.19377024471759796, + "learning_rate": 9.942726231386026e-06, + "loss": 0.2615, + "step": 536 + }, + { + "epoch": 1.1070234113712374, + "grad_norm": 0.20523156225681305, + "learning_rate": 9.919816723940437e-06, + "loss": 0.2548, + "step": 537 + }, + { + "epoch": 1.1090815538976073, + "grad_norm": 0.2046228051185608, + "learning_rate": 9.896907216494846e-06, + "loss": 0.2704, + "step": 538 + }, + { + "epoch": 1.1111396964239773, + "grad_norm": 0.21209484338760376, + "learning_rate": 9.873997709049257e-06, + "loss": 0.2637, + "step": 539 + }, + { + "epoch": 1.1131978389503474, + "grad_norm": 0.20251420140266418, + "learning_rate": 9.851088201603667e-06, + "loss": 0.2617, + "step": 540 + }, + { + "epoch": 1.1152559814767173, + "grad_norm": 0.21695846319198608, + "learning_rate": 9.828178694158076e-06, + "loss": 0.2658, + "step": 541 + }, + { + "epoch": 1.1173141240030873, + "grad_norm": 0.2015303075313568, + "learning_rate": 9.805269186712487e-06, + "loss": 0.2528, + "step": 542 + }, + { + "epoch": 1.1193722665294572, + "grad_norm": 0.21796390414237976, + "learning_rate": 9.782359679266896e-06, + "loss": 0.2625, + "step": 543 + }, + { + "epoch": 1.1214304090558271, + "grad_norm": 0.20676304399967194, + "learning_rate": 9.759450171821306e-06, + "loss": 0.268, + "step": 544 + }, + { + "epoch": 1.123488551582197, + "grad_norm": 0.1986500769853592, + "learning_rate": 9.736540664375717e-06, + "loss": 0.2546, + "step": 545 + }, + { + "epoch": 1.125546694108567, + "grad_norm": 0.20008589327335358, + "learning_rate": 9.713631156930127e-06, + "loss": 0.2525, + "step": 546 + }, + { + "epoch": 1.127604836634937, + "grad_norm": 0.1891598105430603, + "learning_rate": 9.690721649484536e-06, + "loss": 0.256, + "step": 547 + }, + { + "epoch": 1.1296629791613069, + "grad_norm": 0.20968230068683624, + "learning_rate": 9.667812142038947e-06, + "loss": 0.2495, + "step": 548 + }, + { + "epoch": 1.1317211216876768, + "grad_norm": 0.2025834023952484, + "learning_rate": 9.644902634593357e-06, + "loss": 0.2533, + "step": 549 + }, + { + "epoch": 1.1337792642140467, + "grad_norm": 0.21087367832660675, + "learning_rate": 9.621993127147768e-06, + "loss": 0.2518, + "step": 550 + }, + { + "epoch": 1.1358374067404169, + "grad_norm": 0.20784996449947357, + "learning_rate": 9.599083619702177e-06, + "loss": 0.2594, + "step": 551 + }, + { + "epoch": 1.1378955492667868, + "grad_norm": 0.20754118263721466, + "learning_rate": 9.576174112256587e-06, + "loss": 0.2515, + "step": 552 + }, + { + "epoch": 1.1399536917931568, + "grad_norm": 0.225090891122818, + "learning_rate": 9.553264604810998e-06, + "loss": 0.2615, + "step": 553 + }, + { + "epoch": 1.1420118343195267, + "grad_norm": 0.24656590819358826, + "learning_rate": 9.530355097365407e-06, + "loss": 0.2636, + "step": 554 + }, + { + "epoch": 1.1440699768458966, + "grad_norm": 0.22454337775707245, + "learning_rate": 9.507445589919818e-06, + "loss": 0.2584, + "step": 555 + }, + { + "epoch": 1.1461281193722666, + "grad_norm": 0.2229425013065338, + "learning_rate": 9.484536082474226e-06, + "loss": 0.2543, + "step": 556 + }, + { + "epoch": 1.1481862618986365, + "grad_norm": 0.18805071711540222, + "learning_rate": 9.461626575028637e-06, + "loss": 0.2593, + "step": 557 + }, + { + "epoch": 1.1502444044250064, + "grad_norm": 0.23163346946239471, + "learning_rate": 9.438717067583048e-06, + "loss": 0.2537, + "step": 558 + }, + { + "epoch": 1.1523025469513763, + "grad_norm": 0.2126983255147934, + "learning_rate": 9.415807560137458e-06, + "loss": 0.2598, + "step": 559 + }, + { + "epoch": 1.1543606894777463, + "grad_norm": 0.2113332748413086, + "learning_rate": 9.392898052691867e-06, + "loss": 0.2617, + "step": 560 + }, + { + "epoch": 1.1564188320041162, + "grad_norm": 0.2220505177974701, + "learning_rate": 9.369988545246278e-06, + "loss": 0.2673, + "step": 561 + }, + { + "epoch": 1.1584769745304861, + "grad_norm": 0.21683354675769806, + "learning_rate": 9.347079037800688e-06, + "loss": 0.259, + "step": 562 + }, + { + "epoch": 1.160535117056856, + "grad_norm": 0.20226940512657166, + "learning_rate": 9.324169530355099e-06, + "loss": 0.2536, + "step": 563 + }, + { + "epoch": 1.1625932595832262, + "grad_norm": 0.2166106402873993, + "learning_rate": 9.301260022909508e-06, + "loss": 0.2573, + "step": 564 + }, + { + "epoch": 1.1646514021095962, + "grad_norm": 0.21802830696105957, + "learning_rate": 9.278350515463918e-06, + "loss": 0.2604, + "step": 565 + }, + { + "epoch": 1.166709544635966, + "grad_norm": 0.19723279774188995, + "learning_rate": 9.255441008018329e-06, + "loss": 0.2643, + "step": 566 + }, + { + "epoch": 1.168767687162336, + "grad_norm": 0.20100893080234528, + "learning_rate": 9.23253150057274e-06, + "loss": 0.2601, + "step": 567 + }, + { + "epoch": 1.170825829688706, + "grad_norm": 0.19834032654762268, + "learning_rate": 9.209621993127148e-06, + "loss": 0.2624, + "step": 568 + }, + { + "epoch": 1.172883972215076, + "grad_norm": 0.20677493512630463, + "learning_rate": 9.186712485681557e-06, + "loss": 0.2527, + "step": 569 + }, + { + "epoch": 1.1749421147414458, + "grad_norm": 0.20895297825336456, + "learning_rate": 9.163802978235968e-06, + "loss": 0.2519, + "step": 570 + }, + { + "epoch": 1.1770002572678158, + "grad_norm": 0.19748030602931976, + "learning_rate": 9.140893470790379e-06, + "loss": 0.2567, + "step": 571 + }, + { + "epoch": 1.1790583997941857, + "grad_norm": 0.20713521540164948, + "learning_rate": 9.117983963344789e-06, + "loss": 0.2771, + "step": 572 + }, + { + "epoch": 1.1811165423205556, + "grad_norm": 0.2146754264831543, + "learning_rate": 9.095074455899198e-06, + "loss": 0.2537, + "step": 573 + }, + { + "epoch": 1.1831746848469256, + "grad_norm": 0.20723004639148712, + "learning_rate": 9.072164948453609e-06, + "loss": 0.253, + "step": 574 + }, + { + "epoch": 1.1852328273732957, + "grad_norm": 0.2072172611951828, + "learning_rate": 9.04925544100802e-06, + "loss": 0.2545, + "step": 575 + }, + { + "epoch": 1.1872909698996654, + "grad_norm": 0.20537281036376953, + "learning_rate": 9.02634593356243e-06, + "loss": 0.2517, + "step": 576 + }, + { + "epoch": 1.1893491124260356, + "grad_norm": 0.21034401655197144, + "learning_rate": 9.003436426116839e-06, + "loss": 0.2506, + "step": 577 + }, + { + "epoch": 1.1914072549524055, + "grad_norm": 0.21373845636844635, + "learning_rate": 8.98052691867125e-06, + "loss": 0.2544, + "step": 578 + }, + { + "epoch": 1.1934653974787754, + "grad_norm": 0.22282572090625763, + "learning_rate": 8.95761741122566e-06, + "loss": 0.2607, + "step": 579 + }, + { + "epoch": 1.1955235400051454, + "grad_norm": 0.20421402156352997, + "learning_rate": 8.93470790378007e-06, + "loss": 0.2636, + "step": 580 + }, + { + "epoch": 1.1975816825315153, + "grad_norm": 0.2095903605222702, + "learning_rate": 8.91179839633448e-06, + "loss": 0.2627, + "step": 581 + }, + { + "epoch": 1.1996398250578852, + "grad_norm": 0.2215132862329483, + "learning_rate": 8.888888888888888e-06, + "loss": 0.2651, + "step": 582 + }, + { + "epoch": 1.2016979675842552, + "grad_norm": 0.22536343336105347, + "learning_rate": 8.865979381443299e-06, + "loss": 0.2548, + "step": 583 + }, + { + "epoch": 1.203756110110625, + "grad_norm": 0.19969668984413147, + "learning_rate": 8.84306987399771e-06, + "loss": 0.2646, + "step": 584 + }, + { + "epoch": 1.205814252636995, + "grad_norm": 0.225993350148201, + "learning_rate": 8.82016036655212e-06, + "loss": 0.2607, + "step": 585 + }, + { + "epoch": 1.207872395163365, + "grad_norm": 0.19197311997413635, + "learning_rate": 8.797250859106529e-06, + "loss": 0.2519, + "step": 586 + }, + { + "epoch": 1.209930537689735, + "grad_norm": 0.1974429190158844, + "learning_rate": 8.77434135166094e-06, + "loss": 0.2512, + "step": 587 + }, + { + "epoch": 1.211988680216105, + "grad_norm": 0.19816122949123383, + "learning_rate": 8.75143184421535e-06, + "loss": 0.2582, + "step": 588 + }, + { + "epoch": 1.214046822742475, + "grad_norm": 0.20259711146354675, + "learning_rate": 8.72852233676976e-06, + "loss": 0.2561, + "step": 589 + }, + { + "epoch": 1.216104965268845, + "grad_norm": 0.23857274651527405, + "learning_rate": 8.70561282932417e-06, + "loss": 0.2574, + "step": 590 + }, + { + "epoch": 1.2181631077952149, + "grad_norm": 0.2108597606420517, + "learning_rate": 8.68270332187858e-06, + "loss": 0.2546, + "step": 591 + }, + { + "epoch": 1.2202212503215848, + "grad_norm": 0.20933857560157776, + "learning_rate": 8.65979381443299e-06, + "loss": 0.2527, + "step": 592 + }, + { + "epoch": 1.2222793928479547, + "grad_norm": 0.19276075065135956, + "learning_rate": 8.636884306987401e-06, + "loss": 0.26, + "step": 593 + }, + { + "epoch": 1.2243375353743247, + "grad_norm": 0.2111658900976181, + "learning_rate": 8.61397479954181e-06, + "loss": 0.267, + "step": 594 + }, + { + "epoch": 1.2263956779006946, + "grad_norm": 0.20039953291416168, + "learning_rate": 8.591065292096221e-06, + "loss": 0.2454, + "step": 595 + }, + { + "epoch": 1.2284538204270645, + "grad_norm": 0.212934210896492, + "learning_rate": 8.56815578465063e-06, + "loss": 0.2674, + "step": 596 + }, + { + "epoch": 1.2305119629534345, + "grad_norm": 0.2036072462797165, + "learning_rate": 8.54524627720504e-06, + "loss": 0.2613, + "step": 597 + }, + { + "epoch": 1.2325701054798044, + "grad_norm": 0.20735019445419312, + "learning_rate": 8.522336769759451e-06, + "loss": 0.2648, + "step": 598 + }, + { + "epoch": 1.2346282480061745, + "grad_norm": 0.2097824215888977, + "learning_rate": 8.49942726231386e-06, + "loss": 0.2535, + "step": 599 + }, + { + "epoch": 1.2366863905325443, + "grad_norm": 0.19988034665584564, + "learning_rate": 8.47651775486827e-06, + "loss": 0.2507, + "step": 600 + }, + { + "epoch": 1.2366863905325443, + "eval_loss": 0.28046268224716187, + "eval_runtime": 2441.2385, + "eval_samples_per_second": 3.184, + "eval_steps_per_second": 0.796, + "step": 600 + }, + { + "epoch": 1.2387445330589144, + "grad_norm": 0.20321473479270935, + "learning_rate": 8.453608247422681e-06, + "loss": 0.2588, + "step": 601 + }, + { + "epoch": 1.2408026755852843, + "grad_norm": 0.20362116396427155, + "learning_rate": 8.430698739977092e-06, + "loss": 0.2608, + "step": 602 + }, + { + "epoch": 1.2428608181116543, + "grad_norm": 0.20123381912708282, + "learning_rate": 8.4077892325315e-06, + "loss": 0.2527, + "step": 603 + }, + { + "epoch": 1.2449189606380242, + "grad_norm": 0.2133895605802536, + "learning_rate": 8.384879725085911e-06, + "loss": 0.2731, + "step": 604 + }, + { + "epoch": 1.2469771031643941, + "grad_norm": 0.5265193581581116, + "learning_rate": 8.361970217640322e-06, + "loss": 0.2498, + "step": 605 + }, + { + "epoch": 1.249035245690764, + "grad_norm": 0.2142847776412964, + "learning_rate": 8.339060710194732e-06, + "loss": 0.268, + "step": 606 + }, + { + "epoch": 1.251093388217134, + "grad_norm": 0.19556185603141785, + "learning_rate": 8.316151202749141e-06, + "loss": 0.2587, + "step": 607 + }, + { + "epoch": 1.253151530743504, + "grad_norm": 0.20104384422302246, + "learning_rate": 8.293241695303552e-06, + "loss": 0.248, + "step": 608 + }, + { + "epoch": 1.2552096732698739, + "grad_norm": 0.20386339724063873, + "learning_rate": 8.27033218785796e-06, + "loss": 0.2564, + "step": 609 + }, + { + "epoch": 1.257267815796244, + "grad_norm": 0.21464361250400543, + "learning_rate": 8.247422680412371e-06, + "loss": 0.2651, + "step": 610 + }, + { + "epoch": 1.2593259583226137, + "grad_norm": 0.20295380055904388, + "learning_rate": 8.224513172966782e-06, + "loss": 0.249, + "step": 611 + }, + { + "epoch": 1.261384100848984, + "grad_norm": 0.19431617856025696, + "learning_rate": 8.201603665521193e-06, + "loss": 0.2487, + "step": 612 + }, + { + "epoch": 1.2634422433753538, + "grad_norm": 0.20218072831630707, + "learning_rate": 8.178694158075601e-06, + "loss": 0.2609, + "step": 613 + }, + { + "epoch": 1.2655003859017238, + "grad_norm": 0.20500090718269348, + "learning_rate": 8.155784650630012e-06, + "loss": 0.2705, + "step": 614 + }, + { + "epoch": 1.2675585284280937, + "grad_norm": 0.20803052186965942, + "learning_rate": 8.132875143184423e-06, + "loss": 0.2525, + "step": 615 + }, + { + "epoch": 1.2696166709544636, + "grad_norm": 0.2087874561548233, + "learning_rate": 8.109965635738832e-06, + "loss": 0.2541, + "step": 616 + }, + { + "epoch": 1.2716748134808336, + "grad_norm": 0.2055324912071228, + "learning_rate": 8.087056128293242e-06, + "loss": 0.2647, + "step": 617 + }, + { + "epoch": 1.2737329560072035, + "grad_norm": 0.20352068543434143, + "learning_rate": 8.064146620847653e-06, + "loss": 0.2666, + "step": 618 + }, + { + "epoch": 1.2757910985335734, + "grad_norm": 0.20651914179325104, + "learning_rate": 8.041237113402063e-06, + "loss": 0.2525, + "step": 619 + }, + { + "epoch": 1.2778492410599434, + "grad_norm": 0.2097817212343216, + "learning_rate": 8.018327605956472e-06, + "loss": 0.2576, + "step": 620 + }, + { + "epoch": 1.2799073835863133, + "grad_norm": 0.20695503056049347, + "learning_rate": 7.995418098510883e-06, + "loss": 0.2633, + "step": 621 + }, + { + "epoch": 1.2819655261126832, + "grad_norm": 0.20550110936164856, + "learning_rate": 7.972508591065293e-06, + "loss": 0.2629, + "step": 622 + }, + { + "epoch": 1.2840236686390534, + "grad_norm": 0.2035083919763565, + "learning_rate": 7.949599083619702e-06, + "loss": 0.2566, + "step": 623 + }, + { + "epoch": 1.286081811165423, + "grad_norm": 0.21426044404506683, + "learning_rate": 7.926689576174113e-06, + "loss": 0.2636, + "step": 624 + }, + { + "epoch": 1.2881399536917932, + "grad_norm": 0.20519520342350006, + "learning_rate": 7.903780068728523e-06, + "loss": 0.2665, + "step": 625 + }, + { + "epoch": 1.2901980962181632, + "grad_norm": 0.2012549638748169, + "learning_rate": 7.880870561282932e-06, + "loss": 0.2588, + "step": 626 + }, + { + "epoch": 1.292256238744533, + "grad_norm": 0.19951675832271576, + "learning_rate": 7.857961053837343e-06, + "loss": 0.2592, + "step": 627 + }, + { + "epoch": 1.294314381270903, + "grad_norm": 0.21163856983184814, + "learning_rate": 7.835051546391754e-06, + "loss": 0.26, + "step": 628 + }, + { + "epoch": 1.296372523797273, + "grad_norm": 0.21543577313423157, + "learning_rate": 7.812142038946164e-06, + "loss": 0.2486, + "step": 629 + }, + { + "epoch": 1.298430666323643, + "grad_norm": 0.20984649658203125, + "learning_rate": 7.789232531500573e-06, + "loss": 0.2603, + "step": 630 + }, + { + "epoch": 1.3004888088500128, + "grad_norm": 0.20047229528427124, + "learning_rate": 7.766323024054984e-06, + "loss": 0.2559, + "step": 631 + }, + { + "epoch": 1.3025469513763828, + "grad_norm": 0.21747010946273804, + "learning_rate": 7.743413516609394e-06, + "loss": 0.2563, + "step": 632 + }, + { + "epoch": 1.3046050939027527, + "grad_norm": 0.20818108320236206, + "learning_rate": 7.720504009163803e-06, + "loss": 0.2507, + "step": 633 + }, + { + "epoch": 1.3066632364291229, + "grad_norm": 0.19827309250831604, + "learning_rate": 7.697594501718214e-06, + "loss": 0.2578, + "step": 634 + }, + { + "epoch": 1.3087213789554926, + "grad_norm": 0.2122543305158615, + "learning_rate": 7.674684994272624e-06, + "loss": 0.2633, + "step": 635 + }, + { + "epoch": 1.3107795214818627, + "grad_norm": 0.20870576798915863, + "learning_rate": 7.651775486827033e-06, + "loss": 0.2616, + "step": 636 + }, + { + "epoch": 1.3128376640082327, + "grad_norm": 0.2069362998008728, + "learning_rate": 7.628865979381444e-06, + "loss": 0.2426, + "step": 637 + }, + { + "epoch": 1.3148958065346026, + "grad_norm": 0.19999894499778748, + "learning_rate": 7.6059564719358535e-06, + "loss": 0.2547, + "step": 638 + }, + { + "epoch": 1.3169539490609725, + "grad_norm": 0.20518334209918976, + "learning_rate": 7.583046964490264e-06, + "loss": 0.2571, + "step": 639 + }, + { + "epoch": 1.3190120915873425, + "grad_norm": 0.20558986067771912, + "learning_rate": 7.560137457044674e-06, + "loss": 0.2483, + "step": 640 + }, + { + "epoch": 1.3210702341137124, + "grad_norm": 0.21443884074687958, + "learning_rate": 7.5372279495990845e-06, + "loss": 0.2494, + "step": 641 + }, + { + "epoch": 1.3231283766400823, + "grad_norm": 0.2025483101606369, + "learning_rate": 7.514318442153494e-06, + "loss": 0.2473, + "step": 642 + }, + { + "epoch": 1.3251865191664522, + "grad_norm": 0.21094976365566254, + "learning_rate": 7.491408934707905e-06, + "loss": 0.2603, + "step": 643 + }, + { + "epoch": 1.3272446616928222, + "grad_norm": 0.2047881782054901, + "learning_rate": 7.4684994272623145e-06, + "loss": 0.2601, + "step": 644 + }, + { + "epoch": 1.3293028042191921, + "grad_norm": 0.2075866013765335, + "learning_rate": 7.445589919816725e-06, + "loss": 0.2644, + "step": 645 + }, + { + "epoch": 1.331360946745562, + "grad_norm": 0.2174414098262787, + "learning_rate": 7.422680412371135e-06, + "loss": 0.2609, + "step": 646 + }, + { + "epoch": 1.3334190892719322, + "grad_norm": 0.20820266008377075, + "learning_rate": 7.3997709049255455e-06, + "loss": 0.2535, + "step": 647 + }, + { + "epoch": 1.335477231798302, + "grad_norm": 0.20941515266895294, + "learning_rate": 7.376861397479955e-06, + "loss": 0.2578, + "step": 648 + }, + { + "epoch": 1.337535374324672, + "grad_norm": 0.2027975171804428, + "learning_rate": 7.353951890034365e-06, + "loss": 0.2573, + "step": 649 + }, + { + "epoch": 1.339593516851042, + "grad_norm": 0.209550142288208, + "learning_rate": 7.331042382588775e-06, + "loss": 0.2513, + "step": 650 + }, + { + "epoch": 1.341651659377412, + "grad_norm": 0.21425557136535645, + "learning_rate": 7.3081328751431845e-06, + "loss": 0.2568, + "step": 651 + }, + { + "epoch": 1.3437098019037819, + "grad_norm": 0.22760476171970367, + "learning_rate": 7.285223367697595e-06, + "loss": 0.2549, + "step": 652 + }, + { + "epoch": 1.3457679444301518, + "grad_norm": 0.21329441666603088, + "learning_rate": 7.262313860252005e-06, + "loss": 0.2467, + "step": 653 + }, + { + "epoch": 1.3478260869565217, + "grad_norm": 0.20949490368366241, + "learning_rate": 7.239404352806415e-06, + "loss": 0.2569, + "step": 654 + }, + { + "epoch": 1.3498842294828917, + "grad_norm": 0.21022753417491913, + "learning_rate": 7.216494845360825e-06, + "loss": 0.2644, + "step": 655 + }, + { + "epoch": 1.3519423720092616, + "grad_norm": 0.20240676403045654, + "learning_rate": 7.193585337915236e-06, + "loss": 0.2561, + "step": 656 + }, + { + "epoch": 1.3540005145356315, + "grad_norm": 0.19892892241477966, + "learning_rate": 7.1706758304696455e-06, + "loss": 0.2564, + "step": 657 + }, + { + "epoch": 1.3560586570620017, + "grad_norm": 0.22104541957378387, + "learning_rate": 7.147766323024056e-06, + "loss": 0.2466, + "step": 658 + }, + { + "epoch": 1.3581167995883714, + "grad_norm": 0.2074560970067978, + "learning_rate": 7.124856815578466e-06, + "loss": 0.2634, + "step": 659 + }, + { + "epoch": 1.3601749421147415, + "grad_norm": 0.20596396923065186, + "learning_rate": 7.101947308132876e-06, + "loss": 0.2566, + "step": 660 + }, + { + "epoch": 1.3622330846411115, + "grad_norm": 0.2072969526052475, + "learning_rate": 7.079037800687286e-06, + "loss": 0.2603, + "step": 661 + }, + { + "epoch": 1.3642912271674814, + "grad_norm": 0.21680790185928345, + "learning_rate": 7.056128293241697e-06, + "loss": 0.2536, + "step": 662 + }, + { + "epoch": 1.3663493696938513, + "grad_norm": 0.2035921961069107, + "learning_rate": 7.0332187857961065e-06, + "loss": 0.2567, + "step": 663 + }, + { + "epoch": 1.3684075122202213, + "grad_norm": 0.21186605095863342, + "learning_rate": 7.010309278350515e-06, + "loss": 0.2575, + "step": 664 + }, + { + "epoch": 1.3704656547465912, + "grad_norm": 0.21388404071331024, + "learning_rate": 6.987399770904926e-06, + "loss": 0.2522, + "step": 665 + }, + { + "epoch": 1.3725237972729611, + "grad_norm": 0.21118783950805664, + "learning_rate": 6.964490263459336e-06, + "loss": 0.25, + "step": 666 + }, + { + "epoch": 1.374581939799331, + "grad_norm": 0.21162322163581848, + "learning_rate": 6.941580756013746e-06, + "loss": 0.253, + "step": 667 + }, + { + "epoch": 1.376640082325701, + "grad_norm": 0.21186329424381256, + "learning_rate": 6.918671248568156e-06, + "loss": 0.2589, + "step": 668 + }, + { + "epoch": 1.378698224852071, + "grad_norm": 0.21206888556480408, + "learning_rate": 6.895761741122567e-06, + "loss": 0.2629, + "step": 669 + }, + { + "epoch": 1.3807563673784409, + "grad_norm": 0.21045179665088654, + "learning_rate": 6.872852233676976e-06, + "loss": 0.2523, + "step": 670 + }, + { + "epoch": 1.382814509904811, + "grad_norm": 0.21106329560279846, + "learning_rate": 6.849942726231387e-06, + "loss": 0.2611, + "step": 671 + }, + { + "epoch": 1.3848726524311807, + "grad_norm": 0.20593757927417755, + "learning_rate": 6.827033218785797e-06, + "loss": 0.2537, + "step": 672 + }, + { + "epoch": 1.386930794957551, + "grad_norm": 0.2040368914604187, + "learning_rate": 6.804123711340207e-06, + "loss": 0.2545, + "step": 673 + }, + { + "epoch": 1.3889889374839208, + "grad_norm": 0.2148980051279068, + "learning_rate": 6.781214203894617e-06, + "loss": 0.264, + "step": 674 + }, + { + "epoch": 1.3910470800102908, + "grad_norm": 0.204456627368927, + "learning_rate": 6.758304696449028e-06, + "loss": 0.2609, + "step": 675 + }, + { + "epoch": 1.3931052225366607, + "grad_norm": 0.20230846107006073, + "learning_rate": 6.735395189003437e-06, + "loss": 0.2644, + "step": 676 + }, + { + "epoch": 1.3951633650630306, + "grad_norm": 0.205158531665802, + "learning_rate": 6.712485681557846e-06, + "loss": 0.2611, + "step": 677 + }, + { + "epoch": 1.3972215075894006, + "grad_norm": 0.21487553417682648, + "learning_rate": 6.689576174112257e-06, + "loss": 0.2492, + "step": 678 + }, + { + "epoch": 1.3992796501157705, + "grad_norm": 0.21277402341365814, + "learning_rate": 6.666666666666667e-06, + "loss": 0.2491, + "step": 679 + }, + { + "epoch": 1.4013377926421404, + "grad_norm": 0.2049219310283661, + "learning_rate": 6.643757159221077e-06, + "loss": 0.2444, + "step": 680 + }, + { + "epoch": 1.4033959351685104, + "grad_norm": 0.23122920095920563, + "learning_rate": 6.620847651775487e-06, + "loss": 0.2522, + "step": 681 + }, + { + "epoch": 1.4054540776948803, + "grad_norm": 0.2067662477493286, + "learning_rate": 6.597938144329898e-06, + "loss": 0.2583, + "step": 682 + }, + { + "epoch": 1.4075122202212502, + "grad_norm": 0.2043958306312561, + "learning_rate": 6.575028636884307e-06, + "loss": 0.2603, + "step": 683 + }, + { + "epoch": 1.4095703627476204, + "grad_norm": 0.21982067823410034, + "learning_rate": 6.552119129438718e-06, + "loss": 0.246, + "step": 684 + }, + { + "epoch": 1.41162850527399, + "grad_norm": 0.21510522067546844, + "learning_rate": 6.529209621993128e-06, + "loss": 0.2554, + "step": 685 + }, + { + "epoch": 1.4136866478003602, + "grad_norm": 0.24448052048683167, + "learning_rate": 6.506300114547538e-06, + "loss": 0.256, + "step": 686 + }, + { + "epoch": 1.4157447903267302, + "grad_norm": 0.2068399339914322, + "learning_rate": 6.483390607101948e-06, + "loss": 0.2566, + "step": 687 + }, + { + "epoch": 1.4178029328531, + "grad_norm": 0.20870736241340637, + "learning_rate": 6.460481099656359e-06, + "loss": 0.2493, + "step": 688 + }, + { + "epoch": 1.41986107537947, + "grad_norm": 0.22065278887748718, + "learning_rate": 6.437571592210768e-06, + "loss": 0.2566, + "step": 689 + }, + { + "epoch": 1.42191921790584, + "grad_norm": 0.21523869037628174, + "learning_rate": 6.414662084765179e-06, + "loss": 0.2579, + "step": 690 + }, + { + "epoch": 1.42397736043221, + "grad_norm": 0.21578392386436462, + "learning_rate": 6.391752577319588e-06, + "loss": 0.2555, + "step": 691 + }, + { + "epoch": 1.4260355029585798, + "grad_norm": 0.2096480280160904, + "learning_rate": 6.3688430698739976e-06, + "loss": 0.2534, + "step": 692 + }, + { + "epoch": 1.4280936454849498, + "grad_norm": 0.21274186670780182, + "learning_rate": 6.345933562428408e-06, + "loss": 0.2521, + "step": 693 + }, + { + "epoch": 1.4301517880113197, + "grad_norm": 0.21426336467266083, + "learning_rate": 6.323024054982818e-06, + "loss": 0.2589, + "step": 694 + }, + { + "epoch": 1.4322099305376899, + "grad_norm": 0.21294309198856354, + "learning_rate": 6.3001145475372285e-06, + "loss": 0.2615, + "step": 695 + }, + { + "epoch": 1.4342680730640596, + "grad_norm": 0.2021908164024353, + "learning_rate": 6.277205040091638e-06, + "loss": 0.2714, + "step": 696 + }, + { + "epoch": 1.4363262155904297, + "grad_norm": 0.21605439484119415, + "learning_rate": 6.254295532646049e-06, + "loss": 0.2592, + "step": 697 + }, + { + "epoch": 1.4383843581167997, + "grad_norm": 0.2154022753238678, + "learning_rate": 6.231386025200459e-06, + "loss": 0.2633, + "step": 698 + }, + { + "epoch": 1.4404425006431696, + "grad_norm": 0.2178344875574112, + "learning_rate": 6.208476517754869e-06, + "loss": 0.2685, + "step": 699 + }, + { + "epoch": 1.4425006431695395, + "grad_norm": 0.21423941850662231, + "learning_rate": 6.185567010309279e-06, + "loss": 0.2474, + "step": 700 + }, + { + "epoch": 1.4425006431695395, + "eval_loss": 0.27773216366767883, + "eval_runtime": 2423.2314, + "eval_samples_per_second": 3.208, + "eval_steps_per_second": 0.802, + "step": 700 + } + ], + "logging_steps": 1, + "max_steps": 970, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.3131842998809313e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-700/training_args.bin b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-700/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..5999c7ee9dd10ee9076d748e4757533e635fa832 --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-700/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ee55a11f5a306eb7c39b536fdfe2459bc279e468da50f6adda478c4deffcb812 +size 5688 diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-800/README.md b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-800/README.md new file mode 100644 index 0000000000000000000000000000000000000000..fb0d77d70fdc5c829c8889cb85828736b7eb9714 --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-800/README.md @@ -0,0 +1,202 @@ +--- +base_model: unsloth/codegemma-7b-it +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.15.0 \ No newline at end of file diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-800/adapter_config.json b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-800/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e841602c6a59fc7b085ac647af4d4c312445d261 --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-800/adapter_config.json @@ -0,0 +1,39 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "unsloth/codegemma-7b-it", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "o_proj", + "up_proj", + "q_proj", + "down_proj", + "gate_proj", + "k_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-800/adapter_model.safetensors b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-800/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..93ed93f16d7cc22706cd8ba2ac0aae7724069906 --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-800/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5dde38d33ecf148cae06bc20585f6bab2d7d9c730846a639a518c88bc05c9a25 +size 800116456 diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-800/optimizer.pt b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-800/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..e7375cb3ace3d95d9c2c42e9bf61944800382e06 --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-800/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba5e5b3e987c1bee1384829546b0c1c175d9ad4459cedeb96504d6fc0cfedafb +size 406743860 diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-800/rng_state.pth b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-800/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..86e68de0b3d88501794755f34a31dc0a038ae3df --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-800/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc46ebdf2786cf45a03ecdc2890904fdbf8f1b3e4c6aa053d7e714f2d7a72b2b +size 14244 diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-800/scheduler.pt b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-800/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..44992836a5d8fa6598d04ce812d7b6a68148e08b --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-800/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:08064e2e1d440c2532400e643af300342819cba493195863a00c04cebbe7bca8 +size 1064 diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-800/special_tokens_map.json b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-800/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..f6119589e367b2de0fc8cbd2f1217667532e3174 --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-800/special_tokens_map.json @@ -0,0 +1,30 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-800/tokenizer.json b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-800/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..45a5e23f54141c5f4f97a8d58f3ffadc28e287ba --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-800/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d964a2c8346d40f95791533eae48730d5f163c2e65fd16333560fd3e661df318 +size 34362915 diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-800/tokenizer.model b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-800/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..71a98ce40269d847e58957e1e070d9ae8eb184af --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-800/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:583f2ebd2a1936009b7da991ea255504db68c7a9713a78673d1335a87098966c +size 4241023 diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-800/tokenizer_config.json b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-800/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9b9b1b4acdd4afcedae39d1cf6f0bc7ef7d9910f --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-800/tokenizer_config.json @@ -0,0 +1,2011 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "<|fim_prefix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "<|fim_middle|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "<|fim_suffix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "<|file_separator|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "139": { + "content": "▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "140": { + "content": "▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "141": { + "content": "▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "142": { + "content": "▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "143": { + "content": "▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "144": { + "content": "▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "145": { + "content": "▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "146": { + "content": "▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "147": { + "content": "▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "148": { + "content": "▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "149": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "150": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "152": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "153": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "154": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "155": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "156": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "157": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "158": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "159": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "160": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "161": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "162": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "163": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "164": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "165": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "166": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "167": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "168": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255968": { + "content": "[toxicity=0]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255969": { + "content": "\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255970": { + "content": "\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255971": { + "content": "\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255972": { + "content": "\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255973": { + "content": "\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255974": { + "content": "\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255975": { + "content": "\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255976": { + "content": "\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255977": { + "content": "\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255978": { + "content": "\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255979": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255980": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255981": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255982": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255983": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255984": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255985": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255986": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255987": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255988": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255989": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255990": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255991": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255992": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255993": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255994": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255995": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255996": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255997": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255998": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255999": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "bos_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "extra_special_tokens": {}, + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "padding_side": "left", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-800/trainer_state.json b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-800/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..3d9cbaa7a95cf26b1da35d0761bd1fea6f134ed4 --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-800/trainer_state.json @@ -0,0 +1,5698 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.6483148958065346, + "eval_steps": 100, + "global_step": 800, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0020581425263699513, + "grad_norm": 11.994463920593262, + "learning_rate": 2.061855670103093e-07, + "loss": 2.91, + "step": 1 + }, + { + "epoch": 0.004116285052739903, + "grad_norm": 11.769092559814453, + "learning_rate": 4.123711340206186e-07, + "loss": 2.8686, + "step": 2 + }, + { + "epoch": 0.0061744275791098535, + "grad_norm": 13.05551815032959, + "learning_rate": 6.185567010309279e-07, + "loss": 3.0286, + "step": 3 + }, + { + "epoch": 0.008232570105479805, + "grad_norm": 12.334521293640137, + "learning_rate": 8.247422680412372e-07, + "loss": 2.904, + "step": 4 + }, + { + "epoch": 0.010290712631849755, + "grad_norm": 12.075353622436523, + "learning_rate": 1.0309278350515464e-06, + "loss": 2.8991, + "step": 5 + }, + { + "epoch": 0.012348855158219707, + "grad_norm": 11.86032485961914, + "learning_rate": 1.2371134020618557e-06, + "loss": 3.0007, + "step": 6 + }, + { + "epoch": 0.014406997684589657, + "grad_norm": 10.10457992553711, + "learning_rate": 1.4432989690721649e-06, + "loss": 2.8493, + "step": 7 + }, + { + "epoch": 0.01646514021095961, + "grad_norm": 8.56408405303955, + "learning_rate": 1.6494845360824744e-06, + "loss": 2.9573, + "step": 8 + }, + { + "epoch": 0.01852328273732956, + "grad_norm": 6.307392120361328, + "learning_rate": 1.8556701030927837e-06, + "loss": 2.9507, + "step": 9 + }, + { + "epoch": 0.02058142526369951, + "grad_norm": 4.276430130004883, + "learning_rate": 2.061855670103093e-06, + "loss": 2.8988, + "step": 10 + }, + { + "epoch": 0.022639567790069464, + "grad_norm": 2.5912015438079834, + "learning_rate": 2.268041237113402e-06, + "loss": 2.9926, + "step": 11 + }, + { + "epoch": 0.024697710316439414, + "grad_norm": 2.018446207046509, + "learning_rate": 2.4742268041237115e-06, + "loss": 2.9874, + "step": 12 + }, + { + "epoch": 0.026755852842809364, + "grad_norm": 1.8558588027954102, + "learning_rate": 2.680412371134021e-06, + "loss": 2.8608, + "step": 13 + }, + { + "epoch": 0.028813995369179314, + "grad_norm": 1.9658265113830566, + "learning_rate": 2.8865979381443297e-06, + "loss": 2.8596, + "step": 14 + }, + { + "epoch": 0.030872137895549268, + "grad_norm": 1.872044563293457, + "learning_rate": 3.0927835051546395e-06, + "loss": 2.8836, + "step": 15 + }, + { + "epoch": 0.03293028042191922, + "grad_norm": 1.8884096145629883, + "learning_rate": 3.298969072164949e-06, + "loss": 2.9383, + "step": 16 + }, + { + "epoch": 0.03498842294828917, + "grad_norm": 1.8795744180679321, + "learning_rate": 3.5051546391752577e-06, + "loss": 2.883, + "step": 17 + }, + { + "epoch": 0.03704656547465912, + "grad_norm": 1.783678412437439, + "learning_rate": 3.7113402061855674e-06, + "loss": 2.8019, + "step": 18 + }, + { + "epoch": 0.039104708001029075, + "grad_norm": 1.820617914199829, + "learning_rate": 3.917525773195877e-06, + "loss": 2.8813, + "step": 19 + }, + { + "epoch": 0.04116285052739902, + "grad_norm": 1.8188731670379639, + "learning_rate": 4.123711340206186e-06, + "loss": 2.8401, + "step": 20 + }, + { + "epoch": 0.043220993053768975, + "grad_norm": 1.7305251359939575, + "learning_rate": 4.329896907216495e-06, + "loss": 2.7478, + "step": 21 + }, + { + "epoch": 0.04527913558013893, + "grad_norm": 1.7014551162719727, + "learning_rate": 4.536082474226804e-06, + "loss": 2.7356, + "step": 22 + }, + { + "epoch": 0.047337278106508875, + "grad_norm": 1.677381157875061, + "learning_rate": 4.742268041237113e-06, + "loss": 2.7593, + "step": 23 + }, + { + "epoch": 0.04939542063287883, + "grad_norm": 1.628554344177246, + "learning_rate": 4.948453608247423e-06, + "loss": 2.7689, + "step": 24 + }, + { + "epoch": 0.051453563159248775, + "grad_norm": 1.4968128204345703, + "learning_rate": 5.154639175257732e-06, + "loss": 2.6613, + "step": 25 + }, + { + "epoch": 0.05351170568561873, + "grad_norm": 1.4734832048416138, + "learning_rate": 5.360824742268042e-06, + "loss": 2.7095, + "step": 26 + }, + { + "epoch": 0.05556984821198868, + "grad_norm": 1.3745571374893188, + "learning_rate": 5.567010309278351e-06, + "loss": 2.655, + "step": 27 + }, + { + "epoch": 0.05762799073835863, + "grad_norm": 1.3381729125976562, + "learning_rate": 5.7731958762886594e-06, + "loss": 2.55, + "step": 28 + }, + { + "epoch": 0.05968613326472858, + "grad_norm": 1.3388073444366455, + "learning_rate": 5.979381443298969e-06, + "loss": 2.5219, + "step": 29 + }, + { + "epoch": 0.061744275791098535, + "grad_norm": 1.317008376121521, + "learning_rate": 6.185567010309279e-06, + "loss": 2.4491, + "step": 30 + }, + { + "epoch": 0.06380241831746848, + "grad_norm": 1.3210794925689697, + "learning_rate": 6.391752577319588e-06, + "loss": 2.4358, + "step": 31 + }, + { + "epoch": 0.06586056084383844, + "grad_norm": 1.182519555091858, + "learning_rate": 6.597938144329898e-06, + "loss": 2.4514, + "step": 32 + }, + { + "epoch": 0.06791870337020839, + "grad_norm": 1.2238099575042725, + "learning_rate": 6.804123711340207e-06, + "loss": 2.442, + "step": 33 + }, + { + "epoch": 0.06997684589657834, + "grad_norm": 1.1793314218521118, + "learning_rate": 7.010309278350515e-06, + "loss": 2.3864, + "step": 34 + }, + { + "epoch": 0.0720349884229483, + "grad_norm": 1.1983020305633545, + "learning_rate": 7.216494845360825e-06, + "loss": 2.3796, + "step": 35 + }, + { + "epoch": 0.07409313094931824, + "grad_norm": 1.2189652919769287, + "learning_rate": 7.422680412371135e-06, + "loss": 2.4152, + "step": 36 + }, + { + "epoch": 0.07615127347568819, + "grad_norm": 1.14923095703125, + "learning_rate": 7.628865979381444e-06, + "loss": 2.3298, + "step": 37 + }, + { + "epoch": 0.07820941600205815, + "grad_norm": 1.147013545036316, + "learning_rate": 7.835051546391754e-06, + "loss": 2.2488, + "step": 38 + }, + { + "epoch": 0.0802675585284281, + "grad_norm": 1.133981466293335, + "learning_rate": 8.041237113402063e-06, + "loss": 2.1825, + "step": 39 + }, + { + "epoch": 0.08232570105479804, + "grad_norm": 1.1686867475509644, + "learning_rate": 8.247422680412371e-06, + "loss": 2.2282, + "step": 40 + }, + { + "epoch": 0.084383843581168, + "grad_norm": 1.131690502166748, + "learning_rate": 8.453608247422681e-06, + "loss": 2.0962, + "step": 41 + }, + { + "epoch": 0.08644198610753795, + "grad_norm": 1.1626195907592773, + "learning_rate": 8.65979381443299e-06, + "loss": 2.1161, + "step": 42 + }, + { + "epoch": 0.0885001286339079, + "grad_norm": 1.1508581638336182, + "learning_rate": 8.865979381443299e-06, + "loss": 1.9856, + "step": 43 + }, + { + "epoch": 0.09055827116027786, + "grad_norm": 1.2286733388900757, + "learning_rate": 9.072164948453609e-06, + "loss": 2.076, + "step": 44 + }, + { + "epoch": 0.0926164136866478, + "grad_norm": 1.82068932056427, + "learning_rate": 9.278350515463918e-06, + "loss": 1.9995, + "step": 45 + }, + { + "epoch": 0.09467455621301775, + "grad_norm": 2.079101324081421, + "learning_rate": 9.484536082474226e-06, + "loss": 1.9601, + "step": 46 + }, + { + "epoch": 0.0967326987393877, + "grad_norm": 1.1209226846694946, + "learning_rate": 9.690721649484536e-06, + "loss": 1.9346, + "step": 47 + }, + { + "epoch": 0.09879084126575766, + "grad_norm": 1.0579711198806763, + "learning_rate": 9.896907216494846e-06, + "loss": 1.8764, + "step": 48 + }, + { + "epoch": 0.1008489837921276, + "grad_norm": 1.0434011220932007, + "learning_rate": 1.0103092783505156e-05, + "loss": 1.8483, + "step": 49 + }, + { + "epoch": 0.10290712631849755, + "grad_norm": 1.0089991092681885, + "learning_rate": 1.0309278350515464e-05, + "loss": 1.8018, + "step": 50 + }, + { + "epoch": 0.10496526884486751, + "grad_norm": 1.0117324590682983, + "learning_rate": 1.0515463917525775e-05, + "loss": 1.8003, + "step": 51 + }, + { + "epoch": 0.10702341137123746, + "grad_norm": 1.0006697177886963, + "learning_rate": 1.0721649484536083e-05, + "loss": 1.7482, + "step": 52 + }, + { + "epoch": 0.1090815538976074, + "grad_norm": 2.1164329051971436, + "learning_rate": 1.0927835051546391e-05, + "loss": 1.7363, + "step": 53 + }, + { + "epoch": 0.11113969642397736, + "grad_norm": 0.9573502540588379, + "learning_rate": 1.1134020618556703e-05, + "loss": 1.661, + "step": 54 + }, + { + "epoch": 0.11319783895034731, + "grad_norm": 1.0059764385223389, + "learning_rate": 1.134020618556701e-05, + "loss": 1.6979, + "step": 55 + }, + { + "epoch": 0.11525598147671726, + "grad_norm": 0.9719656109809875, + "learning_rate": 1.1546391752577319e-05, + "loss": 1.6318, + "step": 56 + }, + { + "epoch": 0.11731412400308722, + "grad_norm": 1.0024539232254028, + "learning_rate": 1.175257731958763e-05, + "loss": 1.6283, + "step": 57 + }, + { + "epoch": 0.11937226652945716, + "grad_norm": 0.9772456288337708, + "learning_rate": 1.1958762886597938e-05, + "loss": 1.5611, + "step": 58 + }, + { + "epoch": 0.12143040905582711, + "grad_norm": 0.9947625994682312, + "learning_rate": 1.2164948453608248e-05, + "loss": 1.6073, + "step": 59 + }, + { + "epoch": 0.12348855158219707, + "grad_norm": 2.112889051437378, + "learning_rate": 1.2371134020618558e-05, + "loss": 1.6208, + "step": 60 + }, + { + "epoch": 0.12554669410856703, + "grad_norm": 1.0515345335006714, + "learning_rate": 1.2577319587628866e-05, + "loss": 1.569, + "step": 61 + }, + { + "epoch": 0.12760483663493696, + "grad_norm": 1.0782145261764526, + "learning_rate": 1.2783505154639176e-05, + "loss": 1.5097, + "step": 62 + }, + { + "epoch": 0.12966297916130692, + "grad_norm": 1.154104232788086, + "learning_rate": 1.2989690721649485e-05, + "loss": 1.5472, + "step": 63 + }, + { + "epoch": 0.13172112168767688, + "grad_norm": 1.1614656448364258, + "learning_rate": 1.3195876288659795e-05, + "loss": 1.4833, + "step": 64 + }, + { + "epoch": 0.13377926421404682, + "grad_norm": 1.1720911264419556, + "learning_rate": 1.3402061855670103e-05, + "loss": 1.4644, + "step": 65 + }, + { + "epoch": 0.13583740674041678, + "grad_norm": 1.8903896808624268, + "learning_rate": 1.3608247422680415e-05, + "loss": 1.4286, + "step": 66 + }, + { + "epoch": 0.13789554926678674, + "grad_norm": 1.2675013542175293, + "learning_rate": 1.3814432989690723e-05, + "loss": 1.416, + "step": 67 + }, + { + "epoch": 0.13995369179315667, + "grad_norm": 1.266434907913208, + "learning_rate": 1.402061855670103e-05, + "loss": 1.3171, + "step": 68 + }, + { + "epoch": 0.14201183431952663, + "grad_norm": 1.3408889770507812, + "learning_rate": 1.4226804123711342e-05, + "loss": 1.3396, + "step": 69 + }, + { + "epoch": 0.1440699768458966, + "grad_norm": 1.3862446546554565, + "learning_rate": 1.443298969072165e-05, + "loss": 1.2642, + "step": 70 + }, + { + "epoch": 0.14612811937226652, + "grad_norm": 2.110553026199341, + "learning_rate": 1.4639175257731958e-05, + "loss": 1.2593, + "step": 71 + }, + { + "epoch": 0.14818626189863648, + "grad_norm": 1.7017499208450317, + "learning_rate": 1.484536082474227e-05, + "loss": 1.24, + "step": 72 + }, + { + "epoch": 0.15024440442500644, + "grad_norm": 1.9851700067520142, + "learning_rate": 1.5051546391752578e-05, + "loss": 1.2313, + "step": 73 + }, + { + "epoch": 0.15230254695137638, + "grad_norm": 2.009608030319214, + "learning_rate": 1.5257731958762888e-05, + "loss": 1.1281, + "step": 74 + }, + { + "epoch": 0.15436068947774634, + "grad_norm": 2.7587485313415527, + "learning_rate": 1.5463917525773197e-05, + "loss": 1.1248, + "step": 75 + }, + { + "epoch": 0.1564188320041163, + "grad_norm": 2.780954599380493, + "learning_rate": 1.5670103092783507e-05, + "loss": 1.0797, + "step": 76 + }, + { + "epoch": 0.15847697453048623, + "grad_norm": 3.1470866203308105, + "learning_rate": 1.5876288659793813e-05, + "loss": 1.0064, + "step": 77 + }, + { + "epoch": 0.1605351170568562, + "grad_norm": 4.653595447540283, + "learning_rate": 1.6082474226804127e-05, + "loss": 0.9219, + "step": 78 + }, + { + "epoch": 0.16259325958322615, + "grad_norm": 4.157363414764404, + "learning_rate": 1.6288659793814433e-05, + "loss": 0.8709, + "step": 79 + }, + { + "epoch": 0.16465140210959608, + "grad_norm": 4.5814924240112305, + "learning_rate": 1.6494845360824743e-05, + "loss": 0.7693, + "step": 80 + }, + { + "epoch": 0.16670954463596604, + "grad_norm": 5.096139907836914, + "learning_rate": 1.6701030927835052e-05, + "loss": 0.6868, + "step": 81 + }, + { + "epoch": 0.168767687162336, + "grad_norm": 4.858880519866943, + "learning_rate": 1.6907216494845362e-05, + "loss": 0.5971, + "step": 82 + }, + { + "epoch": 0.17082582968870594, + "grad_norm": 4.42564582824707, + "learning_rate": 1.7113402061855672e-05, + "loss": 0.4719, + "step": 83 + }, + { + "epoch": 0.1728839722150759, + "grad_norm": 7.720851421356201, + "learning_rate": 1.731958762886598e-05, + "loss": 0.3943, + "step": 84 + }, + { + "epoch": 0.17494211474144586, + "grad_norm": 0.41923192143440247, + "learning_rate": 1.752577319587629e-05, + "loss": 0.3635, + "step": 85 + }, + { + "epoch": 0.1770002572678158, + "grad_norm": 0.2771846354007721, + "learning_rate": 1.7731958762886598e-05, + "loss": 0.3597, + "step": 86 + }, + { + "epoch": 0.17905839979418575, + "grad_norm": 0.24761857092380524, + "learning_rate": 1.793814432989691e-05, + "loss": 0.3735, + "step": 87 + }, + { + "epoch": 0.1811165423205557, + "grad_norm": 0.23277048766613007, + "learning_rate": 1.8144329896907217e-05, + "loss": 0.3643, + "step": 88 + }, + { + "epoch": 0.18317468484692565, + "grad_norm": 0.22931228578090668, + "learning_rate": 1.8350515463917527e-05, + "loss": 0.3519, + "step": 89 + }, + { + "epoch": 0.1852328273732956, + "grad_norm": 0.20750615000724792, + "learning_rate": 1.8556701030927837e-05, + "loss": 0.3431, + "step": 90 + }, + { + "epoch": 0.18729096989966554, + "grad_norm": 0.2080322951078415, + "learning_rate": 1.8762886597938147e-05, + "loss": 0.3632, + "step": 91 + }, + { + "epoch": 0.1893491124260355, + "grad_norm": 0.20186181366443634, + "learning_rate": 1.8969072164948453e-05, + "loss": 0.3492, + "step": 92 + }, + { + "epoch": 0.19140725495240546, + "grad_norm": 0.19172786176204681, + "learning_rate": 1.9175257731958766e-05, + "loss": 0.3552, + "step": 93 + }, + { + "epoch": 0.1934653974787754, + "grad_norm": 0.1747850626707077, + "learning_rate": 1.9381443298969072e-05, + "loss": 0.3355, + "step": 94 + }, + { + "epoch": 0.19552354000514535, + "grad_norm": 0.196411594748497, + "learning_rate": 1.9587628865979382e-05, + "loss": 0.3271, + "step": 95 + }, + { + "epoch": 0.1975816825315153, + "grad_norm": 0.20063228905200958, + "learning_rate": 1.9793814432989692e-05, + "loss": 0.3351, + "step": 96 + }, + { + "epoch": 0.19963982505788525, + "grad_norm": 0.19240939617156982, + "learning_rate": 2e-05, + "loss": 0.3266, + "step": 97 + }, + { + "epoch": 0.2016979675842552, + "grad_norm": 0.18206572532653809, + "learning_rate": 1.997709049255441e-05, + "loss": 0.3393, + "step": 98 + }, + { + "epoch": 0.20375611011062517, + "grad_norm": 0.20384562015533447, + "learning_rate": 1.9954180985108823e-05, + "loss": 0.3395, + "step": 99 + }, + { + "epoch": 0.2058142526369951, + "grad_norm": 0.19944581389427185, + "learning_rate": 1.9931271477663232e-05, + "loss": 0.3268, + "step": 100 + }, + { + "epoch": 0.2058142526369951, + "eval_loss": 0.3456890285015106, + "eval_runtime": 2114.0178, + "eval_samples_per_second": 3.677, + "eval_steps_per_second": 0.92, + "step": 100 + }, + { + "epoch": 0.20787239516336506, + "grad_norm": 0.17743557691574097, + "learning_rate": 1.990836197021764e-05, + "loss": 0.3439, + "step": 101 + }, + { + "epoch": 0.20993053768973502, + "grad_norm": 0.18746449053287506, + "learning_rate": 1.9885452462772053e-05, + "loss": 0.326, + "step": 102 + }, + { + "epoch": 0.21198868021610495, + "grad_norm": 0.18555815517902374, + "learning_rate": 1.9862542955326462e-05, + "loss": 0.3337, + "step": 103 + }, + { + "epoch": 0.2140468227424749, + "grad_norm": 0.16591575741767883, + "learning_rate": 1.9839633447880874e-05, + "loss": 0.3121, + "step": 104 + }, + { + "epoch": 0.21610496526884487, + "grad_norm": 0.1621987372636795, + "learning_rate": 1.9816723940435283e-05, + "loss": 0.3287, + "step": 105 + }, + { + "epoch": 0.2181631077952148, + "grad_norm": 0.1614532470703125, + "learning_rate": 1.9793814432989692e-05, + "loss": 0.3306, + "step": 106 + }, + { + "epoch": 0.22022125032158477, + "grad_norm": 0.17993387579917908, + "learning_rate": 1.9770904925544104e-05, + "loss": 0.3341, + "step": 107 + }, + { + "epoch": 0.22227939284795473, + "grad_norm": 0.1550011783838272, + "learning_rate": 1.9747995418098513e-05, + "loss": 0.3197, + "step": 108 + }, + { + "epoch": 0.22433753537432466, + "grad_norm": 0.18471524119377136, + "learning_rate": 1.9725085910652922e-05, + "loss": 0.3285, + "step": 109 + }, + { + "epoch": 0.22639567790069462, + "grad_norm": 0.15604373812675476, + "learning_rate": 1.9702176403207334e-05, + "loss": 0.3298, + "step": 110 + }, + { + "epoch": 0.22845382042706458, + "grad_norm": 0.1682298630475998, + "learning_rate": 1.9679266895761743e-05, + "loss": 0.3343, + "step": 111 + }, + { + "epoch": 0.2305119629534345, + "grad_norm": 0.14933635294437408, + "learning_rate": 1.9656357388316152e-05, + "loss": 0.3134, + "step": 112 + }, + { + "epoch": 0.23257010547980447, + "grad_norm": 0.14892347157001495, + "learning_rate": 1.963344788087056e-05, + "loss": 0.3154, + "step": 113 + }, + { + "epoch": 0.23462824800617443, + "grad_norm": 0.1577889323234558, + "learning_rate": 1.9610538373424973e-05, + "loss": 0.3122, + "step": 114 + }, + { + "epoch": 0.23668639053254437, + "grad_norm": 0.16482344269752502, + "learning_rate": 1.9587628865979382e-05, + "loss": 0.3193, + "step": 115 + }, + { + "epoch": 0.23874453305891433, + "grad_norm": 0.15328913927078247, + "learning_rate": 1.956471935853379e-05, + "loss": 0.3217, + "step": 116 + }, + { + "epoch": 0.2408026755852843, + "grad_norm": 0.16140656173229218, + "learning_rate": 1.9541809851088203e-05, + "loss": 0.318, + "step": 117 + }, + { + "epoch": 0.24286081811165422, + "grad_norm": 0.15448373556137085, + "learning_rate": 1.9518900343642612e-05, + "loss": 0.3205, + "step": 118 + }, + { + "epoch": 0.24491896063802418, + "grad_norm": 0.14716887474060059, + "learning_rate": 1.9495990836197025e-05, + "loss": 0.3164, + "step": 119 + }, + { + "epoch": 0.24697710316439414, + "grad_norm": 0.16582027077674866, + "learning_rate": 1.9473081328751433e-05, + "loss": 0.3191, + "step": 120 + }, + { + "epoch": 0.24903524569076407, + "grad_norm": 0.15213699638843536, + "learning_rate": 1.9450171821305842e-05, + "loss": 0.304, + "step": 121 + }, + { + "epoch": 0.25109338821713406, + "grad_norm": 0.1659238487482071, + "learning_rate": 1.9427262313860255e-05, + "loss": 0.3184, + "step": 122 + }, + { + "epoch": 0.253151530743504, + "grad_norm": 0.15596656501293182, + "learning_rate": 1.9404352806414663e-05, + "loss": 0.3092, + "step": 123 + }, + { + "epoch": 0.2552096732698739, + "grad_norm": 0.15868476033210754, + "learning_rate": 1.9381443298969072e-05, + "loss": 0.3163, + "step": 124 + }, + { + "epoch": 0.2572678157962439, + "grad_norm": 0.15386095643043518, + "learning_rate": 1.9358533791523485e-05, + "loss": 0.3049, + "step": 125 + }, + { + "epoch": 0.25932595832261385, + "grad_norm": 0.15179213881492615, + "learning_rate": 1.9335624284077894e-05, + "loss": 0.3131, + "step": 126 + }, + { + "epoch": 0.2613841008489838, + "grad_norm": 0.1595134735107422, + "learning_rate": 1.9312714776632306e-05, + "loss": 0.3069, + "step": 127 + }, + { + "epoch": 0.26344224337535377, + "grad_norm": 0.16989803314208984, + "learning_rate": 1.9289805269186715e-05, + "loss": 0.3052, + "step": 128 + }, + { + "epoch": 0.2655003859017237, + "grad_norm": 0.14803892374038696, + "learning_rate": 1.9266895761741124e-05, + "loss": 0.3065, + "step": 129 + }, + { + "epoch": 0.26755852842809363, + "grad_norm": 0.16676583886146545, + "learning_rate": 1.9243986254295536e-05, + "loss": 0.2962, + "step": 130 + }, + { + "epoch": 0.2696166709544636, + "grad_norm": 0.15694552659988403, + "learning_rate": 1.9221076746849945e-05, + "loss": 0.3096, + "step": 131 + }, + { + "epoch": 0.27167481348083355, + "grad_norm": 0.17696696519851685, + "learning_rate": 1.9198167239404354e-05, + "loss": 0.3145, + "step": 132 + }, + { + "epoch": 0.2737329560072035, + "grad_norm": 0.17204038798809052, + "learning_rate": 1.9175257731958766e-05, + "loss": 0.3248, + "step": 133 + }, + { + "epoch": 0.2757910985335735, + "grad_norm": 0.15630359947681427, + "learning_rate": 1.9152348224513175e-05, + "loss": 0.3117, + "step": 134 + }, + { + "epoch": 0.2778492410599434, + "grad_norm": 0.15757997334003448, + "learning_rate": 1.9129438717067584e-05, + "loss": 0.3145, + "step": 135 + }, + { + "epoch": 0.27990738358631334, + "grad_norm": 0.16273653507232666, + "learning_rate": 1.9106529209621996e-05, + "loss": 0.3159, + "step": 136 + }, + { + "epoch": 0.28196552611268333, + "grad_norm": 0.16213104128837585, + "learning_rate": 1.9083619702176405e-05, + "loss": 0.2949, + "step": 137 + }, + { + "epoch": 0.28402366863905326, + "grad_norm": 0.15377865731716156, + "learning_rate": 1.9060710194730814e-05, + "loss": 0.306, + "step": 138 + }, + { + "epoch": 0.2860818111654232, + "grad_norm": 0.1545962244272232, + "learning_rate": 1.9037800687285223e-05, + "loss": 0.2966, + "step": 139 + }, + { + "epoch": 0.2881399536917932, + "grad_norm": 0.15516617894172668, + "learning_rate": 1.9014891179839635e-05, + "loss": 0.3122, + "step": 140 + }, + { + "epoch": 0.2901980962181631, + "grad_norm": 0.14734458923339844, + "learning_rate": 1.8991981672394044e-05, + "loss": 0.3118, + "step": 141 + }, + { + "epoch": 0.29225623874453305, + "grad_norm": 0.1644304096698761, + "learning_rate": 1.8969072164948453e-05, + "loss": 0.3027, + "step": 142 + }, + { + "epoch": 0.29431438127090304, + "grad_norm": 0.14632569253444672, + "learning_rate": 1.8946162657502865e-05, + "loss": 0.3023, + "step": 143 + }, + { + "epoch": 0.29637252379727297, + "grad_norm": 0.1573137789964676, + "learning_rate": 1.8923253150057274e-05, + "loss": 0.3102, + "step": 144 + }, + { + "epoch": 0.2984306663236429, + "grad_norm": 0.16423144936561584, + "learning_rate": 1.8900343642611686e-05, + "loss": 0.3033, + "step": 145 + }, + { + "epoch": 0.3004888088500129, + "grad_norm": 0.15420907735824585, + "learning_rate": 1.8877434135166095e-05, + "loss": 0.3089, + "step": 146 + }, + { + "epoch": 0.3025469513763828, + "grad_norm": 0.1579178273677826, + "learning_rate": 1.8854524627720504e-05, + "loss": 0.3071, + "step": 147 + }, + { + "epoch": 0.30460509390275275, + "grad_norm": 0.15866397321224213, + "learning_rate": 1.8831615120274916e-05, + "loss": 0.3083, + "step": 148 + }, + { + "epoch": 0.30666323642912274, + "grad_norm": 0.16651487350463867, + "learning_rate": 1.8808705612829325e-05, + "loss": 0.3099, + "step": 149 + }, + { + "epoch": 0.3087213789554927, + "grad_norm": 0.16281908750534058, + "learning_rate": 1.8785796105383734e-05, + "loss": 0.3034, + "step": 150 + }, + { + "epoch": 0.3107795214818626, + "grad_norm": 0.17449837923049927, + "learning_rate": 1.8762886597938147e-05, + "loss": 0.3054, + "step": 151 + }, + { + "epoch": 0.3128376640082326, + "grad_norm": 0.15403546392917633, + "learning_rate": 1.8739977090492555e-05, + "loss": 0.297, + "step": 152 + }, + { + "epoch": 0.31489580653460253, + "grad_norm": 0.1472466140985489, + "learning_rate": 1.8717067583046968e-05, + "loss": 0.2973, + "step": 153 + }, + { + "epoch": 0.31695394906097246, + "grad_norm": 0.16027937829494476, + "learning_rate": 1.8694158075601377e-05, + "loss": 0.3054, + "step": 154 + }, + { + "epoch": 0.31901209158734245, + "grad_norm": 0.17086225748062134, + "learning_rate": 1.8671248568155786e-05, + "loss": 0.307, + "step": 155 + }, + { + "epoch": 0.3210702341137124, + "grad_norm": 0.15930697321891785, + "learning_rate": 1.8648339060710198e-05, + "loss": 0.293, + "step": 156 + }, + { + "epoch": 0.3231283766400823, + "grad_norm": 0.17086376249790192, + "learning_rate": 1.8625429553264607e-05, + "loss": 0.293, + "step": 157 + }, + { + "epoch": 0.3251865191664523, + "grad_norm": 0.15970875322818756, + "learning_rate": 1.8602520045819016e-05, + "loss": 0.3083, + "step": 158 + }, + { + "epoch": 0.32724466169282224, + "grad_norm": 0.16355909407138824, + "learning_rate": 1.8579610538373428e-05, + "loss": 0.3139, + "step": 159 + }, + { + "epoch": 0.32930280421919217, + "grad_norm": 0.15183711051940918, + "learning_rate": 1.8556701030927837e-05, + "loss": 0.2953, + "step": 160 + }, + { + "epoch": 0.33136094674556216, + "grad_norm": 0.15123715996742249, + "learning_rate": 1.853379152348225e-05, + "loss": 0.3025, + "step": 161 + }, + { + "epoch": 0.3334190892719321, + "grad_norm": 0.1576143503189087, + "learning_rate": 1.8510882016036658e-05, + "loss": 0.2904, + "step": 162 + }, + { + "epoch": 0.335477231798302, + "grad_norm": 0.1457504779100418, + "learning_rate": 1.8487972508591067e-05, + "loss": 0.2909, + "step": 163 + }, + { + "epoch": 0.337535374324672, + "grad_norm": 0.1557442992925644, + "learning_rate": 1.846506300114548e-05, + "loss": 0.3027, + "step": 164 + }, + { + "epoch": 0.33959351685104194, + "grad_norm": 0.15662318468093872, + "learning_rate": 1.8442153493699888e-05, + "loss": 0.311, + "step": 165 + }, + { + "epoch": 0.3416516593774119, + "grad_norm": 0.16177058219909668, + "learning_rate": 1.8419243986254297e-05, + "loss": 0.2944, + "step": 166 + }, + { + "epoch": 0.34370980190378186, + "grad_norm": 0.16406729817390442, + "learning_rate": 1.8396334478808706e-05, + "loss": 0.2927, + "step": 167 + }, + { + "epoch": 0.3457679444301518, + "grad_norm": 0.16642791032791138, + "learning_rate": 1.8373424971363115e-05, + "loss": 0.3063, + "step": 168 + }, + { + "epoch": 0.34782608695652173, + "grad_norm": 0.1650693714618683, + "learning_rate": 1.8350515463917527e-05, + "loss": 0.2957, + "step": 169 + }, + { + "epoch": 0.3498842294828917, + "grad_norm": 0.15349675714969635, + "learning_rate": 1.8327605956471936e-05, + "loss": 0.297, + "step": 170 + }, + { + "epoch": 0.35194237200926165, + "grad_norm": 0.17770209908485413, + "learning_rate": 1.8304696449026348e-05, + "loss": 0.3011, + "step": 171 + }, + { + "epoch": 0.3540005145356316, + "grad_norm": 0.1647631675004959, + "learning_rate": 1.8281786941580757e-05, + "loss": 0.2962, + "step": 172 + }, + { + "epoch": 0.35605865706200157, + "grad_norm": 0.1603834480047226, + "learning_rate": 1.8258877434135166e-05, + "loss": 0.2937, + "step": 173 + }, + { + "epoch": 0.3581167995883715, + "grad_norm": 0.16780880093574524, + "learning_rate": 1.8235967926689578e-05, + "loss": 0.2997, + "step": 174 + }, + { + "epoch": 0.36017494211474144, + "grad_norm": 0.15976767241954803, + "learning_rate": 1.8213058419243987e-05, + "loss": 0.3043, + "step": 175 + }, + { + "epoch": 0.3622330846411114, + "grad_norm": 0.16236485540866852, + "learning_rate": 1.8190148911798396e-05, + "loss": 0.3069, + "step": 176 + }, + { + "epoch": 0.36429122716748136, + "grad_norm": 0.16391968727111816, + "learning_rate": 1.816723940435281e-05, + "loss": 0.2923, + "step": 177 + }, + { + "epoch": 0.3663493696938513, + "grad_norm": 0.15806889533996582, + "learning_rate": 1.8144329896907217e-05, + "loss": 0.2872, + "step": 178 + }, + { + "epoch": 0.3684075122202212, + "grad_norm": 0.1627352088689804, + "learning_rate": 1.812142038946163e-05, + "loss": 0.3032, + "step": 179 + }, + { + "epoch": 0.3704656547465912, + "grad_norm": 0.15103371441364288, + "learning_rate": 1.809851088201604e-05, + "loss": 0.2847, + "step": 180 + }, + { + "epoch": 0.37252379727296114, + "grad_norm": 0.15178488194942474, + "learning_rate": 1.8075601374570447e-05, + "loss": 0.3017, + "step": 181 + }, + { + "epoch": 0.3745819397993311, + "grad_norm": 0.15493899583816528, + "learning_rate": 1.805269186712486e-05, + "loss": 0.2901, + "step": 182 + }, + { + "epoch": 0.37664008232570106, + "grad_norm": 0.15990686416625977, + "learning_rate": 1.802978235967927e-05, + "loss": 0.2861, + "step": 183 + }, + { + "epoch": 0.378698224852071, + "grad_norm": 0.15824148058891296, + "learning_rate": 1.8006872852233677e-05, + "loss": 0.2885, + "step": 184 + }, + { + "epoch": 0.38075636737844093, + "grad_norm": 0.15690775215625763, + "learning_rate": 1.798396334478809e-05, + "loss": 0.2814, + "step": 185 + }, + { + "epoch": 0.3828145099048109, + "grad_norm": 0.15833796560764313, + "learning_rate": 1.79610538373425e-05, + "loss": 0.2847, + "step": 186 + }, + { + "epoch": 0.38487265243118085, + "grad_norm": 0.16560044884681702, + "learning_rate": 1.793814432989691e-05, + "loss": 0.3061, + "step": 187 + }, + { + "epoch": 0.3869307949575508, + "grad_norm": 0.16240179538726807, + "learning_rate": 1.791523482245132e-05, + "loss": 0.2943, + "step": 188 + }, + { + "epoch": 0.38898893748392077, + "grad_norm": 0.15825721621513367, + "learning_rate": 1.789232531500573e-05, + "loss": 0.2934, + "step": 189 + }, + { + "epoch": 0.3910470800102907, + "grad_norm": 0.16665388643741608, + "learning_rate": 1.786941580756014e-05, + "loss": 0.291, + "step": 190 + }, + { + "epoch": 0.39310522253666064, + "grad_norm": 0.16581200063228607, + "learning_rate": 1.784650630011455e-05, + "loss": 0.2849, + "step": 191 + }, + { + "epoch": 0.3951633650630306, + "grad_norm": 0.1604345291852951, + "learning_rate": 1.782359679266896e-05, + "loss": 0.3, + "step": 192 + }, + { + "epoch": 0.39722150758940056, + "grad_norm": 0.16107915341854095, + "learning_rate": 1.7800687285223368e-05, + "loss": 0.2847, + "step": 193 + }, + { + "epoch": 0.3992796501157705, + "grad_norm": 0.1571730375289917, + "learning_rate": 1.7777777777777777e-05, + "loss": 0.2863, + "step": 194 + }, + { + "epoch": 0.4013377926421405, + "grad_norm": 0.1656399518251419, + "learning_rate": 1.775486827033219e-05, + "loss": 0.2878, + "step": 195 + }, + { + "epoch": 0.4033959351685104, + "grad_norm": 0.16738460958003998, + "learning_rate": 1.7731958762886598e-05, + "loss": 0.286, + "step": 196 + }, + { + "epoch": 0.40545407769488034, + "grad_norm": 0.16704292595386505, + "learning_rate": 1.770904925544101e-05, + "loss": 0.2919, + "step": 197 + }, + { + "epoch": 0.40751222022125033, + "grad_norm": 0.16215579211711884, + "learning_rate": 1.768613974799542e-05, + "loss": 0.2874, + "step": 198 + }, + { + "epoch": 0.40957036274762026, + "grad_norm": 0.15573479235172272, + "learning_rate": 1.7663230240549828e-05, + "loss": 0.2904, + "step": 199 + }, + { + "epoch": 0.4116285052739902, + "grad_norm": 0.1707623153924942, + "learning_rate": 1.764032073310424e-05, + "loss": 0.289, + "step": 200 + }, + { + "epoch": 0.4116285052739902, + "eval_loss": 0.3214050829410553, + "eval_runtime": 2449.7742, + "eval_samples_per_second": 3.173, + "eval_steps_per_second": 0.794, + "step": 200 + }, + { + "epoch": 0.4136866478003602, + "grad_norm": 0.1699172556400299, + "learning_rate": 1.761741122565865e-05, + "loss": 0.2852, + "step": 201 + }, + { + "epoch": 0.4157447903267301, + "grad_norm": 0.19150058925151825, + "learning_rate": 1.7594501718213058e-05, + "loss": 0.29, + "step": 202 + }, + { + "epoch": 0.41780293285310005, + "grad_norm": 0.15794627368450165, + "learning_rate": 1.757159221076747e-05, + "loss": 0.2746, + "step": 203 + }, + { + "epoch": 0.41986107537947004, + "grad_norm": 0.17305190861225128, + "learning_rate": 1.754868270332188e-05, + "loss": 0.3003, + "step": 204 + }, + { + "epoch": 0.42191921790583997, + "grad_norm": 0.16257523000240326, + "learning_rate": 1.752577319587629e-05, + "loss": 0.2789, + "step": 205 + }, + { + "epoch": 0.4239773604322099, + "grad_norm": 0.17273619771003723, + "learning_rate": 1.75028636884307e-05, + "loss": 0.2917, + "step": 206 + }, + { + "epoch": 0.4260355029585799, + "grad_norm": 0.17502790689468384, + "learning_rate": 1.747995418098511e-05, + "loss": 0.2992, + "step": 207 + }, + { + "epoch": 0.4280936454849498, + "grad_norm": 0.16464050114154816, + "learning_rate": 1.745704467353952e-05, + "loss": 0.2873, + "step": 208 + }, + { + "epoch": 0.43015178801131976, + "grad_norm": 0.1681668758392334, + "learning_rate": 1.743413516609393e-05, + "loss": 0.2991, + "step": 209 + }, + { + "epoch": 0.43220993053768975, + "grad_norm": 0.16957956552505493, + "learning_rate": 1.741122565864834e-05, + "loss": 0.2868, + "step": 210 + }, + { + "epoch": 0.4342680730640597, + "grad_norm": 0.15875883400440216, + "learning_rate": 1.738831615120275e-05, + "loss": 0.2946, + "step": 211 + }, + { + "epoch": 0.4363262155904296, + "grad_norm": 0.18127889931201935, + "learning_rate": 1.736540664375716e-05, + "loss": 0.2835, + "step": 212 + }, + { + "epoch": 0.4383843581167996, + "grad_norm": 0.17822811007499695, + "learning_rate": 1.7342497136311573e-05, + "loss": 0.2944, + "step": 213 + }, + { + "epoch": 0.44044250064316953, + "grad_norm": 0.17555806040763855, + "learning_rate": 1.731958762886598e-05, + "loss": 0.3001, + "step": 214 + }, + { + "epoch": 0.44250064316953946, + "grad_norm": 0.18709121644496918, + "learning_rate": 1.729667812142039e-05, + "loss": 0.282, + "step": 215 + }, + { + "epoch": 0.44455878569590945, + "grad_norm": 0.16322475671768188, + "learning_rate": 1.7273768613974803e-05, + "loss": 0.2883, + "step": 216 + }, + { + "epoch": 0.4466169282222794, + "grad_norm": 0.1677054911851883, + "learning_rate": 1.7250859106529212e-05, + "loss": 0.28, + "step": 217 + }, + { + "epoch": 0.4486750707486493, + "grad_norm": 0.15764063596725464, + "learning_rate": 1.722794959908362e-05, + "loss": 0.2768, + "step": 218 + }, + { + "epoch": 0.4507332132750193, + "grad_norm": 0.16166841983795166, + "learning_rate": 1.7205040091638033e-05, + "loss": 0.2868, + "step": 219 + }, + { + "epoch": 0.45279135580138924, + "grad_norm": 0.1799350380897522, + "learning_rate": 1.7182130584192442e-05, + "loss": 0.2891, + "step": 220 + }, + { + "epoch": 0.45484949832775917, + "grad_norm": 0.18119174242019653, + "learning_rate": 1.715922107674685e-05, + "loss": 0.2841, + "step": 221 + }, + { + "epoch": 0.45690764085412916, + "grad_norm": 0.17725548148155212, + "learning_rate": 1.713631156930126e-05, + "loss": 0.3038, + "step": 222 + }, + { + "epoch": 0.4589657833804991, + "grad_norm": 0.1628233790397644, + "learning_rate": 1.7113402061855672e-05, + "loss": 0.2868, + "step": 223 + }, + { + "epoch": 0.461023925906869, + "grad_norm": 0.1745166927576065, + "learning_rate": 1.709049255441008e-05, + "loss": 0.3033, + "step": 224 + }, + { + "epoch": 0.463082068433239, + "grad_norm": 0.17708267271518707, + "learning_rate": 1.706758304696449e-05, + "loss": 0.2842, + "step": 225 + }, + { + "epoch": 0.46514021095960895, + "grad_norm": 0.1738453358411789, + "learning_rate": 1.7044673539518902e-05, + "loss": 0.3005, + "step": 226 + }, + { + "epoch": 0.4671983534859789, + "grad_norm": 0.1706874966621399, + "learning_rate": 1.702176403207331e-05, + "loss": 0.2924, + "step": 227 + }, + { + "epoch": 0.46925649601234887, + "grad_norm": 0.1697423756122589, + "learning_rate": 1.699885452462772e-05, + "loss": 0.2783, + "step": 228 + }, + { + "epoch": 0.4713146385387188, + "grad_norm": 0.1783403754234314, + "learning_rate": 1.6975945017182132e-05, + "loss": 0.2924, + "step": 229 + }, + { + "epoch": 0.47337278106508873, + "grad_norm": 0.17431536316871643, + "learning_rate": 1.695303550973654e-05, + "loss": 0.2792, + "step": 230 + }, + { + "epoch": 0.4754309235914587, + "grad_norm": 0.164026141166687, + "learning_rate": 1.6930126002290953e-05, + "loss": 0.2825, + "step": 231 + }, + { + "epoch": 0.47748906611782865, + "grad_norm": 0.16449657082557678, + "learning_rate": 1.6907216494845362e-05, + "loss": 0.2831, + "step": 232 + }, + { + "epoch": 0.4795472086441986, + "grad_norm": 0.1812741607427597, + "learning_rate": 1.688430698739977e-05, + "loss": 0.2849, + "step": 233 + }, + { + "epoch": 0.4816053511705686, + "grad_norm": 0.18431834876537323, + "learning_rate": 1.6861397479954183e-05, + "loss": 0.2802, + "step": 234 + }, + { + "epoch": 0.4836634936969385, + "grad_norm": 0.18349015712738037, + "learning_rate": 1.6838487972508592e-05, + "loss": 0.2804, + "step": 235 + }, + { + "epoch": 0.48572163622330844, + "grad_norm": 0.1769968420267105, + "learning_rate": 1.6815578465063e-05, + "loss": 0.2777, + "step": 236 + }, + { + "epoch": 0.4877797787496784, + "grad_norm": 0.17207500338554382, + "learning_rate": 1.6792668957617413e-05, + "loss": 0.2883, + "step": 237 + }, + { + "epoch": 0.48983792127604836, + "grad_norm": 0.1729692667722702, + "learning_rate": 1.6769759450171822e-05, + "loss": 0.2784, + "step": 238 + }, + { + "epoch": 0.4918960638024183, + "grad_norm": 0.17234881222248077, + "learning_rate": 1.6746849942726235e-05, + "loss": 0.2816, + "step": 239 + }, + { + "epoch": 0.4939542063287883, + "grad_norm": 0.17132551968097687, + "learning_rate": 1.6723940435280644e-05, + "loss": 0.2812, + "step": 240 + }, + { + "epoch": 0.4960123488551582, + "grad_norm": 0.1752254068851471, + "learning_rate": 1.6701030927835052e-05, + "loss": 0.2799, + "step": 241 + }, + { + "epoch": 0.49807049138152815, + "grad_norm": 0.1768665313720703, + "learning_rate": 1.6678121420389465e-05, + "loss": 0.2966, + "step": 242 + }, + { + "epoch": 0.5001286339078981, + "grad_norm": 0.18139514327049255, + "learning_rate": 1.6655211912943874e-05, + "loss": 0.2816, + "step": 243 + }, + { + "epoch": 0.5021867764342681, + "grad_norm": 0.17312943935394287, + "learning_rate": 1.6632302405498283e-05, + "loss": 0.2845, + "step": 244 + }, + { + "epoch": 0.5042449189606381, + "grad_norm": 0.17966389656066895, + "learning_rate": 1.6609392898052695e-05, + "loss": 0.2864, + "step": 245 + }, + { + "epoch": 0.506303061487008, + "grad_norm": 0.16653811931610107, + "learning_rate": 1.6586483390607104e-05, + "loss": 0.2759, + "step": 246 + }, + { + "epoch": 0.5083612040133779, + "grad_norm": 0.1634613424539566, + "learning_rate": 1.6563573883161516e-05, + "loss": 0.2728, + "step": 247 + }, + { + "epoch": 0.5104193465397479, + "grad_norm": 0.17358507215976715, + "learning_rate": 1.654066437571592e-05, + "loss": 0.2706, + "step": 248 + }, + { + "epoch": 0.5124774890661178, + "grad_norm": 0.17524316906929016, + "learning_rate": 1.6517754868270334e-05, + "loss": 0.2805, + "step": 249 + }, + { + "epoch": 0.5145356315924878, + "grad_norm": 0.18134094774723053, + "learning_rate": 1.6494845360824743e-05, + "loss": 0.2909, + "step": 250 + }, + { + "epoch": 0.5165937741188578, + "grad_norm": 0.17795510590076447, + "learning_rate": 1.647193585337915e-05, + "loss": 0.2889, + "step": 251 + }, + { + "epoch": 0.5186519166452277, + "grad_norm": 0.16782547533512115, + "learning_rate": 1.6449026345933564e-05, + "loss": 0.2842, + "step": 252 + }, + { + "epoch": 0.5207100591715976, + "grad_norm": 0.17360062897205353, + "learning_rate": 1.6426116838487973e-05, + "loss": 0.2763, + "step": 253 + }, + { + "epoch": 0.5227682016979676, + "grad_norm": 0.17241406440734863, + "learning_rate": 1.6403207331042385e-05, + "loss": 0.2753, + "step": 254 + }, + { + "epoch": 0.5248263442243375, + "grad_norm": 0.1709229201078415, + "learning_rate": 1.6380297823596794e-05, + "loss": 0.2732, + "step": 255 + }, + { + "epoch": 0.5268844867507075, + "grad_norm": 0.1807374209165573, + "learning_rate": 1.6357388316151203e-05, + "loss": 0.2856, + "step": 256 + }, + { + "epoch": 0.5289426292770775, + "grad_norm": 0.1749904304742813, + "learning_rate": 1.6334478808705615e-05, + "loss": 0.285, + "step": 257 + }, + { + "epoch": 0.5310007718034474, + "grad_norm": 0.16673170030117035, + "learning_rate": 1.6311569301260024e-05, + "loss": 0.2825, + "step": 258 + }, + { + "epoch": 0.5330589143298173, + "grad_norm": 0.17239685356616974, + "learning_rate": 1.6288659793814433e-05, + "loss": 0.2845, + "step": 259 + }, + { + "epoch": 0.5351170568561873, + "grad_norm": 0.1831504851579666, + "learning_rate": 1.6265750286368845e-05, + "loss": 0.2859, + "step": 260 + }, + { + "epoch": 0.5371751993825572, + "grad_norm": 0.18507827818393707, + "learning_rate": 1.6242840778923254e-05, + "loss": 0.293, + "step": 261 + }, + { + "epoch": 0.5392333419089272, + "grad_norm": 0.16738134622573853, + "learning_rate": 1.6219931271477663e-05, + "loss": 0.2853, + "step": 262 + }, + { + "epoch": 0.5412914844352972, + "grad_norm": 0.1701226830482483, + "learning_rate": 1.6197021764032075e-05, + "loss": 0.2763, + "step": 263 + }, + { + "epoch": 0.5433496269616671, + "grad_norm": 0.18195705115795135, + "learning_rate": 1.6174112256586484e-05, + "loss": 0.2797, + "step": 264 + }, + { + "epoch": 0.545407769488037, + "grad_norm": 0.1832309514284134, + "learning_rate": 1.6151202749140896e-05, + "loss": 0.2885, + "step": 265 + }, + { + "epoch": 0.547465912014407, + "grad_norm": 0.1773810088634491, + "learning_rate": 1.6128293241695305e-05, + "loss": 0.2682, + "step": 266 + }, + { + "epoch": 0.5495240545407769, + "grad_norm": 0.16989603638648987, + "learning_rate": 1.6105383734249714e-05, + "loss": 0.2821, + "step": 267 + }, + { + "epoch": 0.551582197067147, + "grad_norm": 0.17835170030593872, + "learning_rate": 1.6082474226804127e-05, + "loss": 0.2774, + "step": 268 + }, + { + "epoch": 0.5536403395935169, + "grad_norm": 0.1777082234621048, + "learning_rate": 1.6059564719358535e-05, + "loss": 0.2726, + "step": 269 + }, + { + "epoch": 0.5556984821198868, + "grad_norm": 0.18766450881958008, + "learning_rate": 1.6036655211912944e-05, + "loss": 0.2879, + "step": 270 + }, + { + "epoch": 0.5577566246462567, + "grad_norm": 0.1868186593055725, + "learning_rate": 1.6013745704467357e-05, + "loss": 0.2808, + "step": 271 + }, + { + "epoch": 0.5598147671726267, + "grad_norm": 0.16695882380008698, + "learning_rate": 1.5990836197021766e-05, + "loss": 0.2668, + "step": 272 + }, + { + "epoch": 0.5618729096989966, + "grad_norm": 0.17224495112895966, + "learning_rate": 1.5967926689576178e-05, + "loss": 0.2682, + "step": 273 + }, + { + "epoch": 0.5639310522253667, + "grad_norm": 0.20116423070430756, + "learning_rate": 1.5945017182130587e-05, + "loss": 0.276, + "step": 274 + }, + { + "epoch": 0.5659891947517366, + "grad_norm": 0.19478343427181244, + "learning_rate": 1.5922107674684996e-05, + "loss": 0.2854, + "step": 275 + }, + { + "epoch": 0.5680473372781065, + "grad_norm": 0.20242950320243835, + "learning_rate": 1.5899198167239405e-05, + "loss": 0.2854, + "step": 276 + }, + { + "epoch": 0.5701054798044765, + "grad_norm": 0.19146093726158142, + "learning_rate": 1.5876288659793813e-05, + "loss": 0.2817, + "step": 277 + }, + { + "epoch": 0.5721636223308464, + "grad_norm": 0.1804896742105484, + "learning_rate": 1.5853379152348226e-05, + "loss": 0.2714, + "step": 278 + }, + { + "epoch": 0.5742217648572163, + "grad_norm": 0.19315646588802338, + "learning_rate": 1.5830469644902635e-05, + "loss": 0.2703, + "step": 279 + }, + { + "epoch": 0.5762799073835864, + "grad_norm": 0.1910266876220703, + "learning_rate": 1.5807560137457047e-05, + "loss": 0.2728, + "step": 280 + }, + { + "epoch": 0.5783380499099563, + "grad_norm": 0.20330773293972015, + "learning_rate": 1.5784650630011456e-05, + "loss": 0.2717, + "step": 281 + }, + { + "epoch": 0.5803961924363262, + "grad_norm": 0.19080683588981628, + "learning_rate": 1.5761741122565865e-05, + "loss": 0.2679, + "step": 282 + }, + { + "epoch": 0.5824543349626962, + "grad_norm": 0.18052135407924652, + "learning_rate": 1.5738831615120277e-05, + "loss": 0.2815, + "step": 283 + }, + { + "epoch": 0.5845124774890661, + "grad_norm": 0.1998361051082611, + "learning_rate": 1.5715922107674686e-05, + "loss": 0.2888, + "step": 284 + }, + { + "epoch": 0.586570620015436, + "grad_norm": 0.1978764683008194, + "learning_rate": 1.5693012600229095e-05, + "loss": 0.2926, + "step": 285 + }, + { + "epoch": 0.5886287625418061, + "grad_norm": 0.17189203202724457, + "learning_rate": 1.5670103092783507e-05, + "loss": 0.2674, + "step": 286 + }, + { + "epoch": 0.590686905068176, + "grad_norm": 0.1937166303396225, + "learning_rate": 1.5647193585337916e-05, + "loss": 0.2838, + "step": 287 + }, + { + "epoch": 0.5927450475945459, + "grad_norm": 0.18978627026081085, + "learning_rate": 1.5624284077892328e-05, + "loss": 0.273, + "step": 288 + }, + { + "epoch": 0.5948031901209159, + "grad_norm": 0.17718705534934998, + "learning_rate": 1.5601374570446737e-05, + "loss": 0.2842, + "step": 289 + }, + { + "epoch": 0.5968613326472858, + "grad_norm": 0.1912536770105362, + "learning_rate": 1.5578465063001146e-05, + "loss": 0.2736, + "step": 290 + }, + { + "epoch": 0.5989194751736557, + "grad_norm": 0.18104907870292664, + "learning_rate": 1.555555555555556e-05, + "loss": 0.274, + "step": 291 + }, + { + "epoch": 0.6009776177000258, + "grad_norm": 0.1620381772518158, + "learning_rate": 1.5532646048109967e-05, + "loss": 0.2663, + "step": 292 + }, + { + "epoch": 0.6030357602263957, + "grad_norm": 0.17973916232585907, + "learning_rate": 1.5509736540664376e-05, + "loss": 0.2791, + "step": 293 + }, + { + "epoch": 0.6050939027527656, + "grad_norm": 0.16821186244487762, + "learning_rate": 1.548682703321879e-05, + "loss": 0.2787, + "step": 294 + }, + { + "epoch": 0.6071520452791356, + "grad_norm": 0.18426693975925446, + "learning_rate": 1.5463917525773197e-05, + "loss": 0.2886, + "step": 295 + }, + { + "epoch": 0.6092101878055055, + "grad_norm": 0.19796033203601837, + "learning_rate": 1.5441008018327606e-05, + "loss": 0.268, + "step": 296 + }, + { + "epoch": 0.6112683303318754, + "grad_norm": 0.1971343755722046, + "learning_rate": 1.541809851088202e-05, + "loss": 0.2761, + "step": 297 + }, + { + "epoch": 0.6133264728582455, + "grad_norm": 0.17458567023277283, + "learning_rate": 1.5395189003436427e-05, + "loss": 0.2831, + "step": 298 + }, + { + "epoch": 0.6153846153846154, + "grad_norm": 0.17610400915145874, + "learning_rate": 1.537227949599084e-05, + "loss": 0.2691, + "step": 299 + }, + { + "epoch": 0.6174427579109854, + "grad_norm": 0.1929042488336563, + "learning_rate": 1.534936998854525e-05, + "loss": 0.2847, + "step": 300 + }, + { + "epoch": 0.6174427579109854, + "eval_loss": 0.2959522604942322, + "eval_runtime": 2428.6339, + "eval_samples_per_second": 3.201, + "eval_steps_per_second": 0.8, + "step": 300 + }, + { + "epoch": 0.6195009004373553, + "grad_norm": 0.19430233538150787, + "learning_rate": 1.5326460481099657e-05, + "loss": 0.279, + "step": 301 + }, + { + "epoch": 0.6215590429637252, + "grad_norm": 0.18542642891407013, + "learning_rate": 1.5303550973654066e-05, + "loss": 0.2695, + "step": 302 + }, + { + "epoch": 0.6236171854900951, + "grad_norm": 0.1850169450044632, + "learning_rate": 1.5280641466208475e-05, + "loss": 0.2847, + "step": 303 + }, + { + "epoch": 0.6256753280164652, + "grad_norm": 0.18449267745018005, + "learning_rate": 1.5257731958762888e-05, + "loss": 0.2804, + "step": 304 + }, + { + "epoch": 0.6277334705428351, + "grad_norm": 0.18608458340168, + "learning_rate": 1.5234822451317296e-05, + "loss": 0.2792, + "step": 305 + }, + { + "epoch": 0.6297916130692051, + "grad_norm": 0.21136076748371124, + "learning_rate": 1.5211912943871707e-05, + "loss": 0.2829, + "step": 306 + }, + { + "epoch": 0.631849755595575, + "grad_norm": 0.19672206044197083, + "learning_rate": 1.5189003436426118e-05, + "loss": 0.2854, + "step": 307 + }, + { + "epoch": 0.6339078981219449, + "grad_norm": 0.1834034025669098, + "learning_rate": 1.5166093928980528e-05, + "loss": 0.2775, + "step": 308 + }, + { + "epoch": 0.6359660406483149, + "grad_norm": 0.18414819240570068, + "learning_rate": 1.5143184421534937e-05, + "loss": 0.2794, + "step": 309 + }, + { + "epoch": 0.6380241831746849, + "grad_norm": 0.1890152245759964, + "learning_rate": 1.5120274914089348e-05, + "loss": 0.2718, + "step": 310 + }, + { + "epoch": 0.6400823257010548, + "grad_norm": 0.18923887610435486, + "learning_rate": 1.5097365406643758e-05, + "loss": 0.2795, + "step": 311 + }, + { + "epoch": 0.6421404682274248, + "grad_norm": 0.20047079026699066, + "learning_rate": 1.5074455899198169e-05, + "loss": 0.2811, + "step": 312 + }, + { + "epoch": 0.6441986107537947, + "grad_norm": 0.1910201609134674, + "learning_rate": 1.5051546391752578e-05, + "loss": 0.2732, + "step": 313 + }, + { + "epoch": 0.6462567532801646, + "grad_norm": 0.2021956443786621, + "learning_rate": 1.5028636884306988e-05, + "loss": 0.2806, + "step": 314 + }, + { + "epoch": 0.6483148958065346, + "grad_norm": 0.18957914412021637, + "learning_rate": 1.5005727376861399e-05, + "loss": 0.2681, + "step": 315 + }, + { + "epoch": 0.6503730383329046, + "grad_norm": 0.19858811795711517, + "learning_rate": 1.498281786941581e-05, + "loss": 0.2805, + "step": 316 + }, + { + "epoch": 0.6524311808592745, + "grad_norm": 0.1731935292482376, + "learning_rate": 1.4959908361970218e-05, + "loss": 0.2646, + "step": 317 + }, + { + "epoch": 0.6544893233856445, + "grad_norm": 0.19619058072566986, + "learning_rate": 1.4936998854524629e-05, + "loss": 0.2965, + "step": 318 + }, + { + "epoch": 0.6565474659120144, + "grad_norm": 0.18745696544647217, + "learning_rate": 1.491408934707904e-05, + "loss": 0.2766, + "step": 319 + }, + { + "epoch": 0.6586056084383843, + "grad_norm": 0.18006449937820435, + "learning_rate": 1.489117983963345e-05, + "loss": 0.2788, + "step": 320 + }, + { + "epoch": 0.6606637509647543, + "grad_norm": 0.17593689262866974, + "learning_rate": 1.486827033218786e-05, + "loss": 0.2813, + "step": 321 + }, + { + "epoch": 0.6627218934911243, + "grad_norm": 0.18695640563964844, + "learning_rate": 1.484536082474227e-05, + "loss": 0.281, + "step": 322 + }, + { + "epoch": 0.6647800360174942, + "grad_norm": 0.17909488081932068, + "learning_rate": 1.482245131729668e-05, + "loss": 0.2814, + "step": 323 + }, + { + "epoch": 0.6668381785438642, + "grad_norm": 0.19074076414108276, + "learning_rate": 1.4799541809851091e-05, + "loss": 0.2721, + "step": 324 + }, + { + "epoch": 0.6688963210702341, + "grad_norm": 0.19175754487514496, + "learning_rate": 1.47766323024055e-05, + "loss": 0.2754, + "step": 325 + }, + { + "epoch": 0.670954463596604, + "grad_norm": 0.18646575510501862, + "learning_rate": 1.475372279495991e-05, + "loss": 0.2678, + "step": 326 + }, + { + "epoch": 0.673012606122974, + "grad_norm": 0.18553243577480316, + "learning_rate": 1.4730813287514321e-05, + "loss": 0.281, + "step": 327 + }, + { + "epoch": 0.675070748649344, + "grad_norm": 0.17120976746082306, + "learning_rate": 1.470790378006873e-05, + "loss": 0.2691, + "step": 328 + }, + { + "epoch": 0.677128891175714, + "grad_norm": 0.19170524179935455, + "learning_rate": 1.4684994272623139e-05, + "loss": 0.2685, + "step": 329 + }, + { + "epoch": 0.6791870337020839, + "grad_norm": 0.1851339191198349, + "learning_rate": 1.466208476517755e-05, + "loss": 0.266, + "step": 330 + }, + { + "epoch": 0.6812451762284538, + "grad_norm": 0.1678062081336975, + "learning_rate": 1.4639175257731958e-05, + "loss": 0.2609, + "step": 331 + }, + { + "epoch": 0.6833033187548238, + "grad_norm": 0.17913252115249634, + "learning_rate": 1.4616265750286369e-05, + "loss": 0.2716, + "step": 332 + }, + { + "epoch": 0.6853614612811937, + "grad_norm": 0.1859239637851715, + "learning_rate": 1.459335624284078e-05, + "loss": 0.2712, + "step": 333 + }, + { + "epoch": 0.6874196038075637, + "grad_norm": 0.18390226364135742, + "learning_rate": 1.457044673539519e-05, + "loss": 0.2827, + "step": 334 + }, + { + "epoch": 0.6894777463339337, + "grad_norm": 0.18520398437976837, + "learning_rate": 1.4547537227949599e-05, + "loss": 0.2721, + "step": 335 + }, + { + "epoch": 0.6915358888603036, + "grad_norm": 0.18416717648506165, + "learning_rate": 1.452462772050401e-05, + "loss": 0.2683, + "step": 336 + }, + { + "epoch": 0.6935940313866735, + "grad_norm": 0.18727894127368927, + "learning_rate": 1.450171821305842e-05, + "loss": 0.2733, + "step": 337 + }, + { + "epoch": 0.6956521739130435, + "grad_norm": 0.18597093224525452, + "learning_rate": 1.447880870561283e-05, + "loss": 0.2708, + "step": 338 + }, + { + "epoch": 0.6977103164394134, + "grad_norm": 0.1786068081855774, + "learning_rate": 1.445589919816724e-05, + "loss": 0.2667, + "step": 339 + }, + { + "epoch": 0.6997684589657834, + "grad_norm": 0.17466600239276886, + "learning_rate": 1.443298969072165e-05, + "loss": 0.2786, + "step": 340 + }, + { + "epoch": 0.7018266014921534, + "grad_norm": 0.185857355594635, + "learning_rate": 1.4410080183276061e-05, + "loss": 0.2759, + "step": 341 + }, + { + "epoch": 0.7038847440185233, + "grad_norm": 0.2004527747631073, + "learning_rate": 1.4387170675830471e-05, + "loss": 0.2847, + "step": 342 + }, + { + "epoch": 0.7059428865448932, + "grad_norm": 0.18774060904979706, + "learning_rate": 1.436426116838488e-05, + "loss": 0.2766, + "step": 343 + }, + { + "epoch": 0.7080010290712632, + "grad_norm": 0.1840328425168991, + "learning_rate": 1.4341351660939291e-05, + "loss": 0.2722, + "step": 344 + }, + { + "epoch": 0.7100591715976331, + "grad_norm": 0.19089624285697937, + "learning_rate": 1.4318442153493702e-05, + "loss": 0.2779, + "step": 345 + }, + { + "epoch": 0.7121173141240031, + "grad_norm": 0.1848018616437912, + "learning_rate": 1.4295532646048112e-05, + "loss": 0.2739, + "step": 346 + }, + { + "epoch": 0.7141754566503731, + "grad_norm": 0.18844038248062134, + "learning_rate": 1.4272623138602521e-05, + "loss": 0.27, + "step": 347 + }, + { + "epoch": 0.716233599176743, + "grad_norm": 0.19289302825927734, + "learning_rate": 1.4249713631156932e-05, + "loss": 0.2743, + "step": 348 + }, + { + "epoch": 0.7182917417031129, + "grad_norm": 0.18738920986652374, + "learning_rate": 1.4226804123711342e-05, + "loss": 0.2657, + "step": 349 + }, + { + "epoch": 0.7203498842294829, + "grad_norm": 0.1925181746482849, + "learning_rate": 1.4203894616265753e-05, + "loss": 0.2637, + "step": 350 + }, + { + "epoch": 0.7224080267558528, + "grad_norm": 0.19114750623703003, + "learning_rate": 1.4180985108820162e-05, + "loss": 0.2758, + "step": 351 + }, + { + "epoch": 0.7244661692822228, + "grad_norm": 0.18310120701789856, + "learning_rate": 1.4158075601374572e-05, + "loss": 0.2777, + "step": 352 + }, + { + "epoch": 0.7265243118085928, + "grad_norm": 0.2045605331659317, + "learning_rate": 1.4135166093928983e-05, + "loss": 0.2653, + "step": 353 + }, + { + "epoch": 0.7285824543349627, + "grad_norm": 0.1856454759836197, + "learning_rate": 1.4112256586483393e-05, + "loss": 0.267, + "step": 354 + }, + { + "epoch": 0.7306405968613326, + "grad_norm": 0.1855366826057434, + "learning_rate": 1.4089347079037802e-05, + "loss": 0.2805, + "step": 355 + }, + { + "epoch": 0.7326987393877026, + "grad_norm": 0.17913414537906647, + "learning_rate": 1.4066437571592213e-05, + "loss": 0.2755, + "step": 356 + }, + { + "epoch": 0.7347568819140725, + "grad_norm": 0.2057684361934662, + "learning_rate": 1.404352806414662e-05, + "loss": 0.2668, + "step": 357 + }, + { + "epoch": 0.7368150244404424, + "grad_norm": 0.190156951546669, + "learning_rate": 1.402061855670103e-05, + "loss": 0.2778, + "step": 358 + }, + { + "epoch": 0.7388731669668125, + "grad_norm": 0.19387219846248627, + "learning_rate": 1.3997709049255441e-05, + "loss": 0.2785, + "step": 359 + }, + { + "epoch": 0.7409313094931824, + "grad_norm": 0.1933836042881012, + "learning_rate": 1.3974799541809852e-05, + "loss": 0.2661, + "step": 360 + }, + { + "epoch": 0.7429894520195524, + "grad_norm": 0.19618812203407288, + "learning_rate": 1.3951890034364261e-05, + "loss": 0.2622, + "step": 361 + }, + { + "epoch": 0.7450475945459223, + "grad_norm": 0.18786942958831787, + "learning_rate": 1.3928980526918671e-05, + "loss": 0.2695, + "step": 362 + }, + { + "epoch": 0.7471057370722922, + "grad_norm": 0.19361330568790436, + "learning_rate": 1.3906071019473082e-05, + "loss": 0.2869, + "step": 363 + }, + { + "epoch": 0.7491638795986622, + "grad_norm": 0.19813291728496552, + "learning_rate": 1.3883161512027493e-05, + "loss": 0.2753, + "step": 364 + }, + { + "epoch": 0.7512220221250322, + "grad_norm": 0.1891734004020691, + "learning_rate": 1.3860252004581902e-05, + "loss": 0.2694, + "step": 365 + }, + { + "epoch": 0.7532801646514021, + "grad_norm": 0.18902742862701416, + "learning_rate": 1.3837342497136312e-05, + "loss": 0.2675, + "step": 366 + }, + { + "epoch": 0.7553383071777721, + "grad_norm": 0.19838480651378632, + "learning_rate": 1.3814432989690723e-05, + "loss": 0.2721, + "step": 367 + }, + { + "epoch": 0.757396449704142, + "grad_norm": 0.20880939066410065, + "learning_rate": 1.3791523482245133e-05, + "loss": 0.2641, + "step": 368 + }, + { + "epoch": 0.7594545922305119, + "grad_norm": 0.20068003237247467, + "learning_rate": 1.3768613974799542e-05, + "loss": 0.2945, + "step": 369 + }, + { + "epoch": 0.7615127347568819, + "grad_norm": 0.19780132174491882, + "learning_rate": 1.3745704467353953e-05, + "loss": 0.2687, + "step": 370 + }, + { + "epoch": 0.7635708772832519, + "grad_norm": 0.19194689393043518, + "learning_rate": 1.3722794959908363e-05, + "loss": 0.2731, + "step": 371 + }, + { + "epoch": 0.7656290198096218, + "grad_norm": 0.19504573941230774, + "learning_rate": 1.3699885452462774e-05, + "loss": 0.2551, + "step": 372 + }, + { + "epoch": 0.7676871623359918, + "grad_norm": 0.18304413557052612, + "learning_rate": 1.3676975945017183e-05, + "loss": 0.2692, + "step": 373 + }, + { + "epoch": 0.7697453048623617, + "grad_norm": 0.2051483392715454, + "learning_rate": 1.3654066437571593e-05, + "loss": 0.2791, + "step": 374 + }, + { + "epoch": 0.7718034473887316, + "grad_norm": 0.18748973309993744, + "learning_rate": 1.3631156930126004e-05, + "loss": 0.2671, + "step": 375 + }, + { + "epoch": 0.7738615899151016, + "grad_norm": 0.19167177379131317, + "learning_rate": 1.3608247422680415e-05, + "loss": 0.2766, + "step": 376 + }, + { + "epoch": 0.7759197324414716, + "grad_norm": 0.17931750416755676, + "learning_rate": 1.3585337915234824e-05, + "loss": 0.2748, + "step": 377 + }, + { + "epoch": 0.7779778749678415, + "grad_norm": 0.19437509775161743, + "learning_rate": 1.3562428407789234e-05, + "loss": 0.2667, + "step": 378 + }, + { + "epoch": 0.7800360174942115, + "grad_norm": 0.19813868403434753, + "learning_rate": 1.3539518900343645e-05, + "loss": 0.2771, + "step": 379 + }, + { + "epoch": 0.7820941600205814, + "grad_norm": 0.19205260276794434, + "learning_rate": 1.3516609392898055e-05, + "loss": 0.2703, + "step": 380 + }, + { + "epoch": 0.7841523025469513, + "grad_norm": 0.19039763510227203, + "learning_rate": 1.3493699885452464e-05, + "loss": 0.264, + "step": 381 + }, + { + "epoch": 0.7862104450733213, + "grad_norm": 0.18269500136375427, + "learning_rate": 1.3470790378006875e-05, + "loss": 0.2653, + "step": 382 + }, + { + "epoch": 0.7882685875996913, + "grad_norm": 0.1922067403793335, + "learning_rate": 1.3447880870561285e-05, + "loss": 0.2754, + "step": 383 + }, + { + "epoch": 0.7903267301260612, + "grad_norm": 0.19615666568279266, + "learning_rate": 1.3424971363115693e-05, + "loss": 0.2811, + "step": 384 + }, + { + "epoch": 0.7923848726524312, + "grad_norm": 0.19037973880767822, + "learning_rate": 1.3402061855670103e-05, + "loss": 0.2673, + "step": 385 + }, + { + "epoch": 0.7944430151788011, + "grad_norm": 0.191124826669693, + "learning_rate": 1.3379152348224514e-05, + "loss": 0.2683, + "step": 386 + }, + { + "epoch": 0.796501157705171, + "grad_norm": 0.18429923057556152, + "learning_rate": 1.3356242840778923e-05, + "loss": 0.2698, + "step": 387 + }, + { + "epoch": 0.798559300231541, + "grad_norm": 0.1839045137166977, + "learning_rate": 1.3333333333333333e-05, + "loss": 0.2895, + "step": 388 + }, + { + "epoch": 0.800617442757911, + "grad_norm": 0.1944131702184677, + "learning_rate": 1.3310423825887744e-05, + "loss": 0.2641, + "step": 389 + }, + { + "epoch": 0.802675585284281, + "grad_norm": 0.20407740771770477, + "learning_rate": 1.3287514318442154e-05, + "loss": 0.2743, + "step": 390 + }, + { + "epoch": 0.8047337278106509, + "grad_norm": 0.1814037561416626, + "learning_rate": 1.3264604810996563e-05, + "loss": 0.2672, + "step": 391 + }, + { + "epoch": 0.8067918703370208, + "grad_norm": 0.1886950582265854, + "learning_rate": 1.3241695303550974e-05, + "loss": 0.2725, + "step": 392 + }, + { + "epoch": 0.8088500128633908, + "grad_norm": 0.19429941475391388, + "learning_rate": 1.3218785796105385e-05, + "loss": 0.2669, + "step": 393 + }, + { + "epoch": 0.8109081553897607, + "grad_norm": 0.19143058359622955, + "learning_rate": 1.3195876288659795e-05, + "loss": 0.2659, + "step": 394 + }, + { + "epoch": 0.8129662979161307, + "grad_norm": 0.2213468849658966, + "learning_rate": 1.3172966781214204e-05, + "loss": 0.2764, + "step": 395 + }, + { + "epoch": 0.8150244404425007, + "grad_norm": 0.2040800005197525, + "learning_rate": 1.3150057273768615e-05, + "loss": 0.2783, + "step": 396 + }, + { + "epoch": 0.8170825829688706, + "grad_norm": 0.1948375254869461, + "learning_rate": 1.3127147766323025e-05, + "loss": 0.2689, + "step": 397 + }, + { + "epoch": 0.8191407254952405, + "grad_norm": 0.1915021538734436, + "learning_rate": 1.3104238258877436e-05, + "loss": 0.2808, + "step": 398 + }, + { + "epoch": 0.8211988680216105, + "grad_norm": 0.19760248064994812, + "learning_rate": 1.3081328751431845e-05, + "loss": 0.2712, + "step": 399 + }, + { + "epoch": 0.8232570105479804, + "grad_norm": 0.2082677185535431, + "learning_rate": 1.3058419243986255e-05, + "loss": 0.2707, + "step": 400 + }, + { + "epoch": 0.8232570105479804, + "eval_loss": 0.28778496384620667, + "eval_runtime": 2427.9096, + "eval_samples_per_second": 3.202, + "eval_steps_per_second": 0.801, + "step": 400 + }, + { + "epoch": 0.8253151530743504, + "grad_norm": 0.19694332778453827, + "learning_rate": 1.3035509736540666e-05, + "loss": 0.2801, + "step": 401 + }, + { + "epoch": 0.8273732956007204, + "grad_norm": 0.19448824226856232, + "learning_rate": 1.3012600229095077e-05, + "loss": 0.2632, + "step": 402 + }, + { + "epoch": 0.8294314381270903, + "grad_norm": 0.18745476007461548, + "learning_rate": 1.2989690721649485e-05, + "loss": 0.2773, + "step": 403 + }, + { + "epoch": 0.8314895806534602, + "grad_norm": 0.19524575769901276, + "learning_rate": 1.2966781214203896e-05, + "loss": 0.2594, + "step": 404 + }, + { + "epoch": 0.8335477231798302, + "grad_norm": 0.19612252712249756, + "learning_rate": 1.2943871706758307e-05, + "loss": 0.271, + "step": 405 + }, + { + "epoch": 0.8356058657062001, + "grad_norm": 0.19964493811130524, + "learning_rate": 1.2920962199312717e-05, + "loss": 0.2615, + "step": 406 + }, + { + "epoch": 0.8376640082325701, + "grad_norm": 0.20115099847316742, + "learning_rate": 1.2898052691867126e-05, + "loss": 0.269, + "step": 407 + }, + { + "epoch": 0.8397221507589401, + "grad_norm": 0.18949687480926514, + "learning_rate": 1.2875143184421537e-05, + "loss": 0.2649, + "step": 408 + }, + { + "epoch": 0.84178029328531, + "grad_norm": 0.1931927353143692, + "learning_rate": 1.2852233676975947e-05, + "loss": 0.2611, + "step": 409 + }, + { + "epoch": 0.8438384358116799, + "grad_norm": 0.18723614513874054, + "learning_rate": 1.2829324169530358e-05, + "loss": 0.2699, + "step": 410 + }, + { + "epoch": 0.8458965783380499, + "grad_norm": 0.19405977427959442, + "learning_rate": 1.2806414662084765e-05, + "loss": 0.2691, + "step": 411 + }, + { + "epoch": 0.8479547208644198, + "grad_norm": 0.2021879404783249, + "learning_rate": 1.2783505154639176e-05, + "loss": 0.267, + "step": 412 + }, + { + "epoch": 0.8500128633907899, + "grad_norm": 0.20015574991703033, + "learning_rate": 1.2760595647193586e-05, + "loss": 0.2632, + "step": 413 + }, + { + "epoch": 0.8520710059171598, + "grad_norm": 0.19090059399604797, + "learning_rate": 1.2737686139747995e-05, + "loss": 0.2743, + "step": 414 + }, + { + "epoch": 0.8541291484435297, + "grad_norm": 0.1906920224428177, + "learning_rate": 1.2714776632302406e-05, + "loss": 0.2723, + "step": 415 + }, + { + "epoch": 0.8561872909698997, + "grad_norm": 0.19348129630088806, + "learning_rate": 1.2691867124856816e-05, + "loss": 0.2656, + "step": 416 + }, + { + "epoch": 0.8582454334962696, + "grad_norm": 0.18771213293075562, + "learning_rate": 1.2668957617411227e-05, + "loss": 0.2617, + "step": 417 + }, + { + "epoch": 0.8603035760226395, + "grad_norm": 0.2135135382413864, + "learning_rate": 1.2646048109965636e-05, + "loss": 0.2773, + "step": 418 + }, + { + "epoch": 0.8623617185490096, + "grad_norm": 0.19689443707466125, + "learning_rate": 1.2623138602520046e-05, + "loss": 0.2623, + "step": 419 + }, + { + "epoch": 0.8644198610753795, + "grad_norm": 0.18752440810203552, + "learning_rate": 1.2600229095074457e-05, + "loss": 0.2599, + "step": 420 + }, + { + "epoch": 0.8664780036017494, + "grad_norm": 0.19264395534992218, + "learning_rate": 1.2577319587628866e-05, + "loss": 0.2707, + "step": 421 + }, + { + "epoch": 0.8685361461281194, + "grad_norm": 0.19980797171592712, + "learning_rate": 1.2554410080183277e-05, + "loss": 0.2616, + "step": 422 + }, + { + "epoch": 0.8705942886544893, + "grad_norm": 0.22940242290496826, + "learning_rate": 1.2531500572737687e-05, + "loss": 0.2712, + "step": 423 + }, + { + "epoch": 0.8726524311808592, + "grad_norm": 0.18825359642505646, + "learning_rate": 1.2508591065292098e-05, + "loss": 0.2779, + "step": 424 + }, + { + "epoch": 0.8747105737072293, + "grad_norm": 0.21553562581539154, + "learning_rate": 1.2485681557846507e-05, + "loss": 0.2677, + "step": 425 + }, + { + "epoch": 0.8767687162335992, + "grad_norm": 0.2025568038225174, + "learning_rate": 1.2462772050400917e-05, + "loss": 0.2659, + "step": 426 + }, + { + "epoch": 0.8788268587599691, + "grad_norm": 0.19179950654506683, + "learning_rate": 1.2439862542955328e-05, + "loss": 0.2762, + "step": 427 + }, + { + "epoch": 0.8808850012863391, + "grad_norm": 0.20982210338115692, + "learning_rate": 1.2416953035509738e-05, + "loss": 0.2648, + "step": 428 + }, + { + "epoch": 0.882943143812709, + "grad_norm": 0.2084280252456665, + "learning_rate": 1.2394043528064147e-05, + "loss": 0.2806, + "step": 429 + }, + { + "epoch": 0.8850012863390789, + "grad_norm": 0.1993308663368225, + "learning_rate": 1.2371134020618558e-05, + "loss": 0.2673, + "step": 430 + }, + { + "epoch": 0.887059428865449, + "grad_norm": 0.1917535811662674, + "learning_rate": 1.2348224513172968e-05, + "loss": 0.2596, + "step": 431 + }, + { + "epoch": 0.8891175713918189, + "grad_norm": 0.18980742990970612, + "learning_rate": 1.2325315005727379e-05, + "loss": 0.2607, + "step": 432 + }, + { + "epoch": 0.8911757139181888, + "grad_norm": 0.21062685549259186, + "learning_rate": 1.2302405498281788e-05, + "loss": 0.2612, + "step": 433 + }, + { + "epoch": 0.8932338564445588, + "grad_norm": 0.20591405034065247, + "learning_rate": 1.2279495990836199e-05, + "loss": 0.2698, + "step": 434 + }, + { + "epoch": 0.8952919989709287, + "grad_norm": 0.2052398920059204, + "learning_rate": 1.2256586483390609e-05, + "loss": 0.2673, + "step": 435 + }, + { + "epoch": 0.8973501414972986, + "grad_norm": 0.19963452219963074, + "learning_rate": 1.223367697594502e-05, + "loss": 0.266, + "step": 436 + }, + { + "epoch": 0.8994082840236687, + "grad_norm": 0.1929163783788681, + "learning_rate": 1.2210767468499429e-05, + "loss": 0.2605, + "step": 437 + }, + { + "epoch": 0.9014664265500386, + "grad_norm": 0.19121681153774261, + "learning_rate": 1.218785796105384e-05, + "loss": 0.2642, + "step": 438 + }, + { + "epoch": 0.9035245690764085, + "grad_norm": 0.18931221961975098, + "learning_rate": 1.2164948453608248e-05, + "loss": 0.2653, + "step": 439 + }, + { + "epoch": 0.9055827116027785, + "grad_norm": 0.21359370648860931, + "learning_rate": 1.2142038946162657e-05, + "loss": 0.264, + "step": 440 + }, + { + "epoch": 0.9076408541291484, + "grad_norm": 0.1874193251132965, + "learning_rate": 1.2119129438717068e-05, + "loss": 0.2664, + "step": 441 + }, + { + "epoch": 0.9096989966555183, + "grad_norm": 0.19697226583957672, + "learning_rate": 1.2096219931271478e-05, + "loss": 0.2651, + "step": 442 + }, + { + "epoch": 0.9117571391818884, + "grad_norm": 0.20930957794189453, + "learning_rate": 1.2073310423825889e-05, + "loss": 0.2724, + "step": 443 + }, + { + "epoch": 0.9138152817082583, + "grad_norm": 0.19588977098464966, + "learning_rate": 1.2050400916380298e-05, + "loss": 0.2648, + "step": 444 + }, + { + "epoch": 0.9158734242346283, + "grad_norm": 0.19452017545700073, + "learning_rate": 1.2027491408934708e-05, + "loss": 0.2808, + "step": 445 + }, + { + "epoch": 0.9179315667609982, + "grad_norm": 0.19226408004760742, + "learning_rate": 1.2004581901489119e-05, + "loss": 0.2627, + "step": 446 + }, + { + "epoch": 0.9199897092873681, + "grad_norm": 0.18108274042606354, + "learning_rate": 1.198167239404353e-05, + "loss": 0.2693, + "step": 447 + }, + { + "epoch": 0.922047851813738, + "grad_norm": 0.19352363049983978, + "learning_rate": 1.1958762886597938e-05, + "loss": 0.2705, + "step": 448 + }, + { + "epoch": 0.9241059943401081, + "grad_norm": 0.18535122275352478, + "learning_rate": 1.1935853379152349e-05, + "loss": 0.2608, + "step": 449 + }, + { + "epoch": 0.926164136866478, + "grad_norm": 0.19209617376327515, + "learning_rate": 1.191294387170676e-05, + "loss": 0.2702, + "step": 450 + }, + { + "epoch": 0.928222279392848, + "grad_norm": 0.1866796910762787, + "learning_rate": 1.189003436426117e-05, + "loss": 0.264, + "step": 451 + }, + { + "epoch": 0.9302804219192179, + "grad_norm": 0.21708665788173676, + "learning_rate": 1.1867124856815579e-05, + "loss": 0.2693, + "step": 452 + }, + { + "epoch": 0.9323385644455878, + "grad_norm": 0.19297796487808228, + "learning_rate": 1.184421534936999e-05, + "loss": 0.2745, + "step": 453 + }, + { + "epoch": 0.9343967069719578, + "grad_norm": 0.19070400297641754, + "learning_rate": 1.18213058419244e-05, + "loss": 0.265, + "step": 454 + }, + { + "epoch": 0.9364548494983278, + "grad_norm": 0.19821566343307495, + "learning_rate": 1.1798396334478809e-05, + "loss": 0.2674, + "step": 455 + }, + { + "epoch": 0.9385129920246977, + "grad_norm": 0.2032192200422287, + "learning_rate": 1.177548682703322e-05, + "loss": 0.276, + "step": 456 + }, + { + "epoch": 0.9405711345510677, + "grad_norm": 0.19127750396728516, + "learning_rate": 1.175257731958763e-05, + "loss": 0.2696, + "step": 457 + }, + { + "epoch": 0.9426292770774376, + "grad_norm": 0.19187286496162415, + "learning_rate": 1.1729667812142041e-05, + "loss": 0.2601, + "step": 458 + }, + { + "epoch": 0.9446874196038075, + "grad_norm": 0.20871371030807495, + "learning_rate": 1.170675830469645e-05, + "loss": 0.2687, + "step": 459 + }, + { + "epoch": 0.9467455621301775, + "grad_norm": 0.19228306412696838, + "learning_rate": 1.168384879725086e-05, + "loss": 0.2633, + "step": 460 + }, + { + "epoch": 0.9488037046565475, + "grad_norm": 0.19025444984436035, + "learning_rate": 1.1660939289805271e-05, + "loss": 0.2721, + "step": 461 + }, + { + "epoch": 0.9508618471829174, + "grad_norm": 0.19476914405822754, + "learning_rate": 1.1638029782359682e-05, + "loss": 0.2662, + "step": 462 + }, + { + "epoch": 0.9529199897092874, + "grad_norm": 0.1991666853427887, + "learning_rate": 1.161512027491409e-05, + "loss": 0.269, + "step": 463 + }, + { + "epoch": 0.9549781322356573, + "grad_norm": 0.19385920464992523, + "learning_rate": 1.1592210767468501e-05, + "loss": 0.2647, + "step": 464 + }, + { + "epoch": 0.9570362747620272, + "grad_norm": 0.1911603957414627, + "learning_rate": 1.1569301260022912e-05, + "loss": 0.2679, + "step": 465 + }, + { + "epoch": 0.9590944172883972, + "grad_norm": 0.20373377203941345, + "learning_rate": 1.1546391752577319e-05, + "loss": 0.2694, + "step": 466 + }, + { + "epoch": 0.9611525598147672, + "grad_norm": 0.20550350844860077, + "learning_rate": 1.152348224513173e-05, + "loss": 0.2677, + "step": 467 + }, + { + "epoch": 0.9632107023411371, + "grad_norm": 0.2049354463815689, + "learning_rate": 1.150057273768614e-05, + "loss": 0.2752, + "step": 468 + }, + { + "epoch": 0.9652688448675071, + "grad_norm": 0.21691595017910004, + "learning_rate": 1.147766323024055e-05, + "loss": 0.2727, + "step": 469 + }, + { + "epoch": 0.967326987393877, + "grad_norm": 0.20727306604385376, + "learning_rate": 1.145475372279496e-05, + "loss": 0.2575, + "step": 470 + }, + { + "epoch": 0.969385129920247, + "grad_norm": 0.19166423380374908, + "learning_rate": 1.143184421534937e-05, + "loss": 0.2716, + "step": 471 + }, + { + "epoch": 0.9714432724466169, + "grad_norm": 0.18833886086940765, + "learning_rate": 1.140893470790378e-05, + "loss": 0.2651, + "step": 472 + }, + { + "epoch": 0.9735014149729869, + "grad_norm": 0.19680088758468628, + "learning_rate": 1.1386025200458191e-05, + "loss": 0.2621, + "step": 473 + }, + { + "epoch": 0.9755595574993569, + "grad_norm": 0.20966476202011108, + "learning_rate": 1.13631156930126e-05, + "loss": 0.2725, + "step": 474 + }, + { + "epoch": 0.9776177000257268, + "grad_norm": 0.1963450163602829, + "learning_rate": 1.134020618556701e-05, + "loss": 0.2569, + "step": 475 + }, + { + "epoch": 0.9796758425520967, + "grad_norm": 0.21289944648742676, + "learning_rate": 1.1317296678121421e-05, + "loss": 0.2622, + "step": 476 + }, + { + "epoch": 0.9817339850784667, + "grad_norm": 0.2103341966867447, + "learning_rate": 1.1294387170675832e-05, + "loss": 0.2803, + "step": 477 + }, + { + "epoch": 0.9837921276048366, + "grad_norm": 0.20202945172786713, + "learning_rate": 1.1271477663230241e-05, + "loss": 0.273, + "step": 478 + }, + { + "epoch": 0.9858502701312066, + "grad_norm": 0.18241006135940552, + "learning_rate": 1.1248568155784651e-05, + "loss": 0.2721, + "step": 479 + }, + { + "epoch": 0.9879084126575766, + "grad_norm": 0.19221259653568268, + "learning_rate": 1.1225658648339062e-05, + "loss": 0.2646, + "step": 480 + }, + { + "epoch": 0.9899665551839465, + "grad_norm": 0.19371837377548218, + "learning_rate": 1.1202749140893473e-05, + "loss": 0.2519, + "step": 481 + }, + { + "epoch": 0.9920246977103164, + "grad_norm": 0.1972094029188156, + "learning_rate": 1.1179839633447882e-05, + "loss": 0.2555, + "step": 482 + }, + { + "epoch": 0.9940828402366864, + "grad_norm": 0.19414126873016357, + "learning_rate": 1.1156930126002292e-05, + "loss": 0.2726, + "step": 483 + }, + { + "epoch": 0.9961409827630563, + "grad_norm": 0.18993492424488068, + "learning_rate": 1.1134020618556703e-05, + "loss": 0.2644, + "step": 484 + }, + { + "epoch": 0.9981991252894263, + "grad_norm": 0.19713927805423737, + "learning_rate": 1.1111111111111113e-05, + "loss": 0.2569, + "step": 485 + }, + { + "epoch": 1.00205814252637, + "grad_norm": 0.3423589766025543, + "learning_rate": 1.1088201603665522e-05, + "loss": 0.5285, + "step": 486 + }, + { + "epoch": 1.0041162850527399, + "grad_norm": 0.1901763528585434, + "learning_rate": 1.1065292096219933e-05, + "loss": 0.2621, + "step": 487 + }, + { + "epoch": 1.0061744275791098, + "grad_norm": 0.20508776605129242, + "learning_rate": 1.1042382588774343e-05, + "loss": 0.2665, + "step": 488 + }, + { + "epoch": 1.0082325701054797, + "grad_norm": 0.20188146829605103, + "learning_rate": 1.1019473081328752e-05, + "loss": 0.2547, + "step": 489 + }, + { + "epoch": 1.0102907126318497, + "grad_norm": 0.20245613157749176, + "learning_rate": 1.0996563573883163e-05, + "loss": 0.2657, + "step": 490 + }, + { + "epoch": 1.0123488551582196, + "grad_norm": 0.19711382687091827, + "learning_rate": 1.0973654066437574e-05, + "loss": 0.2597, + "step": 491 + }, + { + "epoch": 1.0144069976845898, + "grad_norm": 0.21538953483104706, + "learning_rate": 1.0950744558991984e-05, + "loss": 0.2727, + "step": 492 + }, + { + "epoch": 1.0164651402109597, + "grad_norm": 0.20296984910964966, + "learning_rate": 1.0927835051546391e-05, + "loss": 0.2634, + "step": 493 + }, + { + "epoch": 1.0185232827373296, + "grad_norm": 0.20134592056274414, + "learning_rate": 1.0904925544100802e-05, + "loss": 0.2596, + "step": 494 + }, + { + "epoch": 1.0205814252636995, + "grad_norm": 0.200101837515831, + "learning_rate": 1.0882016036655212e-05, + "loss": 0.2575, + "step": 495 + }, + { + "epoch": 1.0226395677900695, + "grad_norm": 0.19144928455352783, + "learning_rate": 1.0859106529209621e-05, + "loss": 0.263, + "step": 496 + }, + { + "epoch": 1.0246977103164394, + "grad_norm": 0.19832482933998108, + "learning_rate": 1.0836197021764032e-05, + "loss": 0.2656, + "step": 497 + }, + { + "epoch": 1.0267558528428093, + "grad_norm": 0.20965202152729034, + "learning_rate": 1.0813287514318443e-05, + "loss": 0.2611, + "step": 498 + }, + { + "epoch": 1.0288139953691793, + "grad_norm": 0.1974337100982666, + "learning_rate": 1.0790378006872853e-05, + "loss": 0.2667, + "step": 499 + }, + { + "epoch": 1.0308721378955492, + "grad_norm": 0.20611713826656342, + "learning_rate": 1.0767468499427262e-05, + "loss": 0.2674, + "step": 500 + }, + { + "epoch": 1.0308721378955492, + "eval_loss": 0.2836935222148895, + "eval_runtime": 2423.44, + "eval_samples_per_second": 3.208, + "eval_steps_per_second": 0.802, + "step": 500 + }, + { + "epoch": 1.0329302804219191, + "grad_norm": 0.202958345413208, + "learning_rate": 1.0744558991981673e-05, + "loss": 0.2684, + "step": 501 + }, + { + "epoch": 1.034988422948289, + "grad_norm": 0.1984429508447647, + "learning_rate": 1.0721649484536083e-05, + "loss": 0.2557, + "step": 502 + }, + { + "epoch": 1.0370465654746592, + "grad_norm": 0.19396482408046722, + "learning_rate": 1.0698739977090494e-05, + "loss": 0.255, + "step": 503 + }, + { + "epoch": 1.0391047080010292, + "grad_norm": 0.19176840782165527, + "learning_rate": 1.0675830469644903e-05, + "loss": 0.2675, + "step": 504 + }, + { + "epoch": 1.041162850527399, + "grad_norm": 0.20167966187000275, + "learning_rate": 1.0652920962199313e-05, + "loss": 0.2669, + "step": 505 + }, + { + "epoch": 1.043220993053769, + "grad_norm": 0.2049783617258072, + "learning_rate": 1.0630011454753724e-05, + "loss": 0.2446, + "step": 506 + }, + { + "epoch": 1.045279135580139, + "grad_norm": 0.19293472170829773, + "learning_rate": 1.0607101947308135e-05, + "loss": 0.256, + "step": 507 + }, + { + "epoch": 1.047337278106509, + "grad_norm": 0.19432370364665985, + "learning_rate": 1.0584192439862543e-05, + "loss": 0.2605, + "step": 508 + }, + { + "epoch": 1.0493954206328788, + "grad_norm": 0.19784876704216003, + "learning_rate": 1.0561282932416954e-05, + "loss": 0.2617, + "step": 509 + }, + { + "epoch": 1.0514535631592488, + "grad_norm": 0.19982090592384338, + "learning_rate": 1.0538373424971365e-05, + "loss": 0.264, + "step": 510 + }, + { + "epoch": 1.0535117056856187, + "grad_norm": 0.2019587755203247, + "learning_rate": 1.0515463917525775e-05, + "loss": 0.2543, + "step": 511 + }, + { + "epoch": 1.0555698482119886, + "grad_norm": 0.19848807156085968, + "learning_rate": 1.0492554410080184e-05, + "loss": 0.2613, + "step": 512 + }, + { + "epoch": 1.0576279907383586, + "grad_norm": 0.20360374450683594, + "learning_rate": 1.0469644902634595e-05, + "loss": 0.2675, + "step": 513 + }, + { + "epoch": 1.0596861332647285, + "grad_norm": 0.19209840893745422, + "learning_rate": 1.0446735395189005e-05, + "loss": 0.2517, + "step": 514 + }, + { + "epoch": 1.0617442757910984, + "grad_norm": 0.19142381846904755, + "learning_rate": 1.0423825887743416e-05, + "loss": 0.2631, + "step": 515 + }, + { + "epoch": 1.0638024183174686, + "grad_norm": 0.20222575962543488, + "learning_rate": 1.0400916380297825e-05, + "loss": 0.2625, + "step": 516 + }, + { + "epoch": 1.0658605608438385, + "grad_norm": 0.1984448879957199, + "learning_rate": 1.0378006872852235e-05, + "loss": 0.2584, + "step": 517 + }, + { + "epoch": 1.0679187033702084, + "grad_norm": 0.1992885023355484, + "learning_rate": 1.0355097365406646e-05, + "loss": 0.2609, + "step": 518 + }, + { + "epoch": 1.0699768458965784, + "grad_norm": 0.20708978176116943, + "learning_rate": 1.0332187857961057e-05, + "loss": 0.2618, + "step": 519 + }, + { + "epoch": 1.0720349884229483, + "grad_norm": 0.22806766629219055, + "learning_rate": 1.0309278350515464e-05, + "loss": 0.2634, + "step": 520 + }, + { + "epoch": 1.0740931309493182, + "grad_norm": 0.2019941806793213, + "learning_rate": 1.0286368843069874e-05, + "loss": 0.2588, + "step": 521 + }, + { + "epoch": 1.0761512734756882, + "grad_norm": 0.19460470974445343, + "learning_rate": 1.0263459335624283e-05, + "loss": 0.2692, + "step": 522 + }, + { + "epoch": 1.078209416002058, + "grad_norm": 0.19483187794685364, + "learning_rate": 1.0240549828178694e-05, + "loss": 0.2474, + "step": 523 + }, + { + "epoch": 1.080267558528428, + "grad_norm": 0.2199576050043106, + "learning_rate": 1.0217640320733104e-05, + "loss": 0.2582, + "step": 524 + }, + { + "epoch": 1.082325701054798, + "grad_norm": 0.20485302805900574, + "learning_rate": 1.0194730813287515e-05, + "loss": 0.2463, + "step": 525 + }, + { + "epoch": 1.084383843581168, + "grad_norm": 0.20773454010486603, + "learning_rate": 1.0171821305841924e-05, + "loss": 0.2501, + "step": 526 + }, + { + "epoch": 1.086441986107538, + "grad_norm": 0.19593262672424316, + "learning_rate": 1.0148911798396335e-05, + "loss": 0.2608, + "step": 527 + }, + { + "epoch": 1.088500128633908, + "grad_norm": 0.20500554144382477, + "learning_rate": 1.0126002290950745e-05, + "loss": 0.2586, + "step": 528 + }, + { + "epoch": 1.090558271160278, + "grad_norm": 0.19919747114181519, + "learning_rate": 1.0103092783505156e-05, + "loss": 0.2724, + "step": 529 + }, + { + "epoch": 1.0926164136866479, + "grad_norm": 0.1953326314687729, + "learning_rate": 1.0080183276059565e-05, + "loss": 0.2456, + "step": 530 + }, + { + "epoch": 1.0946745562130178, + "grad_norm": 0.2155047059059143, + "learning_rate": 1.0057273768613975e-05, + "loss": 0.2644, + "step": 531 + }, + { + "epoch": 1.0967326987393877, + "grad_norm": 0.19747495651245117, + "learning_rate": 1.0034364261168386e-05, + "loss": 0.2539, + "step": 532 + }, + { + "epoch": 1.0987908412657577, + "grad_norm": 0.20261652767658234, + "learning_rate": 1.0011454753722796e-05, + "loss": 0.255, + "step": 533 + }, + { + "epoch": 1.1008489837921276, + "grad_norm": 0.19529719650745392, + "learning_rate": 9.988545246277205e-06, + "loss": 0.2489, + "step": 534 + }, + { + "epoch": 1.1029071263184975, + "grad_norm": 0.20239490270614624, + "learning_rate": 9.965635738831616e-06, + "loss": 0.2664, + "step": 535 + }, + { + "epoch": 1.1049652688448675, + "grad_norm": 0.19377024471759796, + "learning_rate": 9.942726231386026e-06, + "loss": 0.2615, + "step": 536 + }, + { + "epoch": 1.1070234113712374, + "grad_norm": 0.20523156225681305, + "learning_rate": 9.919816723940437e-06, + "loss": 0.2548, + "step": 537 + }, + { + "epoch": 1.1090815538976073, + "grad_norm": 0.2046228051185608, + "learning_rate": 9.896907216494846e-06, + "loss": 0.2704, + "step": 538 + }, + { + "epoch": 1.1111396964239773, + "grad_norm": 0.21209484338760376, + "learning_rate": 9.873997709049257e-06, + "loss": 0.2637, + "step": 539 + }, + { + "epoch": 1.1131978389503474, + "grad_norm": 0.20251420140266418, + "learning_rate": 9.851088201603667e-06, + "loss": 0.2617, + "step": 540 + }, + { + "epoch": 1.1152559814767173, + "grad_norm": 0.21695846319198608, + "learning_rate": 9.828178694158076e-06, + "loss": 0.2658, + "step": 541 + }, + { + "epoch": 1.1173141240030873, + "grad_norm": 0.2015303075313568, + "learning_rate": 9.805269186712487e-06, + "loss": 0.2528, + "step": 542 + }, + { + "epoch": 1.1193722665294572, + "grad_norm": 0.21796390414237976, + "learning_rate": 9.782359679266896e-06, + "loss": 0.2625, + "step": 543 + }, + { + "epoch": 1.1214304090558271, + "grad_norm": 0.20676304399967194, + "learning_rate": 9.759450171821306e-06, + "loss": 0.268, + "step": 544 + }, + { + "epoch": 1.123488551582197, + "grad_norm": 0.1986500769853592, + "learning_rate": 9.736540664375717e-06, + "loss": 0.2546, + "step": 545 + }, + { + "epoch": 1.125546694108567, + "grad_norm": 0.20008589327335358, + "learning_rate": 9.713631156930127e-06, + "loss": 0.2525, + "step": 546 + }, + { + "epoch": 1.127604836634937, + "grad_norm": 0.1891598105430603, + "learning_rate": 9.690721649484536e-06, + "loss": 0.256, + "step": 547 + }, + { + "epoch": 1.1296629791613069, + "grad_norm": 0.20968230068683624, + "learning_rate": 9.667812142038947e-06, + "loss": 0.2495, + "step": 548 + }, + { + "epoch": 1.1317211216876768, + "grad_norm": 0.2025834023952484, + "learning_rate": 9.644902634593357e-06, + "loss": 0.2533, + "step": 549 + }, + { + "epoch": 1.1337792642140467, + "grad_norm": 0.21087367832660675, + "learning_rate": 9.621993127147768e-06, + "loss": 0.2518, + "step": 550 + }, + { + "epoch": 1.1358374067404169, + "grad_norm": 0.20784996449947357, + "learning_rate": 9.599083619702177e-06, + "loss": 0.2594, + "step": 551 + }, + { + "epoch": 1.1378955492667868, + "grad_norm": 0.20754118263721466, + "learning_rate": 9.576174112256587e-06, + "loss": 0.2515, + "step": 552 + }, + { + "epoch": 1.1399536917931568, + "grad_norm": 0.225090891122818, + "learning_rate": 9.553264604810998e-06, + "loss": 0.2615, + "step": 553 + }, + { + "epoch": 1.1420118343195267, + "grad_norm": 0.24656590819358826, + "learning_rate": 9.530355097365407e-06, + "loss": 0.2636, + "step": 554 + }, + { + "epoch": 1.1440699768458966, + "grad_norm": 0.22454337775707245, + "learning_rate": 9.507445589919818e-06, + "loss": 0.2584, + "step": 555 + }, + { + "epoch": 1.1461281193722666, + "grad_norm": 0.2229425013065338, + "learning_rate": 9.484536082474226e-06, + "loss": 0.2543, + "step": 556 + }, + { + "epoch": 1.1481862618986365, + "grad_norm": 0.18805071711540222, + "learning_rate": 9.461626575028637e-06, + "loss": 0.2593, + "step": 557 + }, + { + "epoch": 1.1502444044250064, + "grad_norm": 0.23163346946239471, + "learning_rate": 9.438717067583048e-06, + "loss": 0.2537, + "step": 558 + }, + { + "epoch": 1.1523025469513763, + "grad_norm": 0.2126983255147934, + "learning_rate": 9.415807560137458e-06, + "loss": 0.2598, + "step": 559 + }, + { + "epoch": 1.1543606894777463, + "grad_norm": 0.2113332748413086, + "learning_rate": 9.392898052691867e-06, + "loss": 0.2617, + "step": 560 + }, + { + "epoch": 1.1564188320041162, + "grad_norm": 0.2220505177974701, + "learning_rate": 9.369988545246278e-06, + "loss": 0.2673, + "step": 561 + }, + { + "epoch": 1.1584769745304861, + "grad_norm": 0.21683354675769806, + "learning_rate": 9.347079037800688e-06, + "loss": 0.259, + "step": 562 + }, + { + "epoch": 1.160535117056856, + "grad_norm": 0.20226940512657166, + "learning_rate": 9.324169530355099e-06, + "loss": 0.2536, + "step": 563 + }, + { + "epoch": 1.1625932595832262, + "grad_norm": 0.2166106402873993, + "learning_rate": 9.301260022909508e-06, + "loss": 0.2573, + "step": 564 + }, + { + "epoch": 1.1646514021095962, + "grad_norm": 0.21802830696105957, + "learning_rate": 9.278350515463918e-06, + "loss": 0.2604, + "step": 565 + }, + { + "epoch": 1.166709544635966, + "grad_norm": 0.19723279774188995, + "learning_rate": 9.255441008018329e-06, + "loss": 0.2643, + "step": 566 + }, + { + "epoch": 1.168767687162336, + "grad_norm": 0.20100893080234528, + "learning_rate": 9.23253150057274e-06, + "loss": 0.2601, + "step": 567 + }, + { + "epoch": 1.170825829688706, + "grad_norm": 0.19834032654762268, + "learning_rate": 9.209621993127148e-06, + "loss": 0.2624, + "step": 568 + }, + { + "epoch": 1.172883972215076, + "grad_norm": 0.20677493512630463, + "learning_rate": 9.186712485681557e-06, + "loss": 0.2527, + "step": 569 + }, + { + "epoch": 1.1749421147414458, + "grad_norm": 0.20895297825336456, + "learning_rate": 9.163802978235968e-06, + "loss": 0.2519, + "step": 570 + }, + { + "epoch": 1.1770002572678158, + "grad_norm": 0.19748030602931976, + "learning_rate": 9.140893470790379e-06, + "loss": 0.2567, + "step": 571 + }, + { + "epoch": 1.1790583997941857, + "grad_norm": 0.20713521540164948, + "learning_rate": 9.117983963344789e-06, + "loss": 0.2771, + "step": 572 + }, + { + "epoch": 1.1811165423205556, + "grad_norm": 0.2146754264831543, + "learning_rate": 9.095074455899198e-06, + "loss": 0.2537, + "step": 573 + }, + { + "epoch": 1.1831746848469256, + "grad_norm": 0.20723004639148712, + "learning_rate": 9.072164948453609e-06, + "loss": 0.253, + "step": 574 + }, + { + "epoch": 1.1852328273732957, + "grad_norm": 0.2072172611951828, + "learning_rate": 9.04925544100802e-06, + "loss": 0.2545, + "step": 575 + }, + { + "epoch": 1.1872909698996654, + "grad_norm": 0.20537281036376953, + "learning_rate": 9.02634593356243e-06, + "loss": 0.2517, + "step": 576 + }, + { + "epoch": 1.1893491124260356, + "grad_norm": 0.21034401655197144, + "learning_rate": 9.003436426116839e-06, + "loss": 0.2506, + "step": 577 + }, + { + "epoch": 1.1914072549524055, + "grad_norm": 0.21373845636844635, + "learning_rate": 8.98052691867125e-06, + "loss": 0.2544, + "step": 578 + }, + { + "epoch": 1.1934653974787754, + "grad_norm": 0.22282572090625763, + "learning_rate": 8.95761741122566e-06, + "loss": 0.2607, + "step": 579 + }, + { + "epoch": 1.1955235400051454, + "grad_norm": 0.20421402156352997, + "learning_rate": 8.93470790378007e-06, + "loss": 0.2636, + "step": 580 + }, + { + "epoch": 1.1975816825315153, + "grad_norm": 0.2095903605222702, + "learning_rate": 8.91179839633448e-06, + "loss": 0.2627, + "step": 581 + }, + { + "epoch": 1.1996398250578852, + "grad_norm": 0.2215132862329483, + "learning_rate": 8.888888888888888e-06, + "loss": 0.2651, + "step": 582 + }, + { + "epoch": 1.2016979675842552, + "grad_norm": 0.22536343336105347, + "learning_rate": 8.865979381443299e-06, + "loss": 0.2548, + "step": 583 + }, + { + "epoch": 1.203756110110625, + "grad_norm": 0.19969668984413147, + "learning_rate": 8.84306987399771e-06, + "loss": 0.2646, + "step": 584 + }, + { + "epoch": 1.205814252636995, + "grad_norm": 0.225993350148201, + "learning_rate": 8.82016036655212e-06, + "loss": 0.2607, + "step": 585 + }, + { + "epoch": 1.207872395163365, + "grad_norm": 0.19197311997413635, + "learning_rate": 8.797250859106529e-06, + "loss": 0.2519, + "step": 586 + }, + { + "epoch": 1.209930537689735, + "grad_norm": 0.1974429190158844, + "learning_rate": 8.77434135166094e-06, + "loss": 0.2512, + "step": 587 + }, + { + "epoch": 1.211988680216105, + "grad_norm": 0.19816122949123383, + "learning_rate": 8.75143184421535e-06, + "loss": 0.2582, + "step": 588 + }, + { + "epoch": 1.214046822742475, + "grad_norm": 0.20259711146354675, + "learning_rate": 8.72852233676976e-06, + "loss": 0.2561, + "step": 589 + }, + { + "epoch": 1.216104965268845, + "grad_norm": 0.23857274651527405, + "learning_rate": 8.70561282932417e-06, + "loss": 0.2574, + "step": 590 + }, + { + "epoch": 1.2181631077952149, + "grad_norm": 0.2108597606420517, + "learning_rate": 8.68270332187858e-06, + "loss": 0.2546, + "step": 591 + }, + { + "epoch": 1.2202212503215848, + "grad_norm": 0.20933857560157776, + "learning_rate": 8.65979381443299e-06, + "loss": 0.2527, + "step": 592 + }, + { + "epoch": 1.2222793928479547, + "grad_norm": 0.19276075065135956, + "learning_rate": 8.636884306987401e-06, + "loss": 0.26, + "step": 593 + }, + { + "epoch": 1.2243375353743247, + "grad_norm": 0.2111658900976181, + "learning_rate": 8.61397479954181e-06, + "loss": 0.267, + "step": 594 + }, + { + "epoch": 1.2263956779006946, + "grad_norm": 0.20039953291416168, + "learning_rate": 8.591065292096221e-06, + "loss": 0.2454, + "step": 595 + }, + { + "epoch": 1.2284538204270645, + "grad_norm": 0.212934210896492, + "learning_rate": 8.56815578465063e-06, + "loss": 0.2674, + "step": 596 + }, + { + "epoch": 1.2305119629534345, + "grad_norm": 0.2036072462797165, + "learning_rate": 8.54524627720504e-06, + "loss": 0.2613, + "step": 597 + }, + { + "epoch": 1.2325701054798044, + "grad_norm": 0.20735019445419312, + "learning_rate": 8.522336769759451e-06, + "loss": 0.2648, + "step": 598 + }, + { + "epoch": 1.2346282480061745, + "grad_norm": 0.2097824215888977, + "learning_rate": 8.49942726231386e-06, + "loss": 0.2535, + "step": 599 + }, + { + "epoch": 1.2366863905325443, + "grad_norm": 0.19988034665584564, + "learning_rate": 8.47651775486827e-06, + "loss": 0.2507, + "step": 600 + }, + { + "epoch": 1.2366863905325443, + "eval_loss": 0.28046268224716187, + "eval_runtime": 2441.2385, + "eval_samples_per_second": 3.184, + "eval_steps_per_second": 0.796, + "step": 600 + }, + { + "epoch": 1.2387445330589144, + "grad_norm": 0.20321473479270935, + "learning_rate": 8.453608247422681e-06, + "loss": 0.2588, + "step": 601 + }, + { + "epoch": 1.2408026755852843, + "grad_norm": 0.20362116396427155, + "learning_rate": 8.430698739977092e-06, + "loss": 0.2608, + "step": 602 + }, + { + "epoch": 1.2428608181116543, + "grad_norm": 0.20123381912708282, + "learning_rate": 8.4077892325315e-06, + "loss": 0.2527, + "step": 603 + }, + { + "epoch": 1.2449189606380242, + "grad_norm": 0.2133895605802536, + "learning_rate": 8.384879725085911e-06, + "loss": 0.2731, + "step": 604 + }, + { + "epoch": 1.2469771031643941, + "grad_norm": 0.5265193581581116, + "learning_rate": 8.361970217640322e-06, + "loss": 0.2498, + "step": 605 + }, + { + "epoch": 1.249035245690764, + "grad_norm": 0.2142847776412964, + "learning_rate": 8.339060710194732e-06, + "loss": 0.268, + "step": 606 + }, + { + "epoch": 1.251093388217134, + "grad_norm": 0.19556185603141785, + "learning_rate": 8.316151202749141e-06, + "loss": 0.2587, + "step": 607 + }, + { + "epoch": 1.253151530743504, + "grad_norm": 0.20104384422302246, + "learning_rate": 8.293241695303552e-06, + "loss": 0.248, + "step": 608 + }, + { + "epoch": 1.2552096732698739, + "grad_norm": 0.20386339724063873, + "learning_rate": 8.27033218785796e-06, + "loss": 0.2564, + "step": 609 + }, + { + "epoch": 1.257267815796244, + "grad_norm": 0.21464361250400543, + "learning_rate": 8.247422680412371e-06, + "loss": 0.2651, + "step": 610 + }, + { + "epoch": 1.2593259583226137, + "grad_norm": 0.20295380055904388, + "learning_rate": 8.224513172966782e-06, + "loss": 0.249, + "step": 611 + }, + { + "epoch": 1.261384100848984, + "grad_norm": 0.19431617856025696, + "learning_rate": 8.201603665521193e-06, + "loss": 0.2487, + "step": 612 + }, + { + "epoch": 1.2634422433753538, + "grad_norm": 0.20218072831630707, + "learning_rate": 8.178694158075601e-06, + "loss": 0.2609, + "step": 613 + }, + { + "epoch": 1.2655003859017238, + "grad_norm": 0.20500090718269348, + "learning_rate": 8.155784650630012e-06, + "loss": 0.2705, + "step": 614 + }, + { + "epoch": 1.2675585284280937, + "grad_norm": 0.20803052186965942, + "learning_rate": 8.132875143184423e-06, + "loss": 0.2525, + "step": 615 + }, + { + "epoch": 1.2696166709544636, + "grad_norm": 0.2087874561548233, + "learning_rate": 8.109965635738832e-06, + "loss": 0.2541, + "step": 616 + }, + { + "epoch": 1.2716748134808336, + "grad_norm": 0.2055324912071228, + "learning_rate": 8.087056128293242e-06, + "loss": 0.2647, + "step": 617 + }, + { + "epoch": 1.2737329560072035, + "grad_norm": 0.20352068543434143, + "learning_rate": 8.064146620847653e-06, + "loss": 0.2666, + "step": 618 + }, + { + "epoch": 1.2757910985335734, + "grad_norm": 0.20651914179325104, + "learning_rate": 8.041237113402063e-06, + "loss": 0.2525, + "step": 619 + }, + { + "epoch": 1.2778492410599434, + "grad_norm": 0.2097817212343216, + "learning_rate": 8.018327605956472e-06, + "loss": 0.2576, + "step": 620 + }, + { + "epoch": 1.2799073835863133, + "grad_norm": 0.20695503056049347, + "learning_rate": 7.995418098510883e-06, + "loss": 0.2633, + "step": 621 + }, + { + "epoch": 1.2819655261126832, + "grad_norm": 0.20550110936164856, + "learning_rate": 7.972508591065293e-06, + "loss": 0.2629, + "step": 622 + }, + { + "epoch": 1.2840236686390534, + "grad_norm": 0.2035083919763565, + "learning_rate": 7.949599083619702e-06, + "loss": 0.2566, + "step": 623 + }, + { + "epoch": 1.286081811165423, + "grad_norm": 0.21426044404506683, + "learning_rate": 7.926689576174113e-06, + "loss": 0.2636, + "step": 624 + }, + { + "epoch": 1.2881399536917932, + "grad_norm": 0.20519520342350006, + "learning_rate": 7.903780068728523e-06, + "loss": 0.2665, + "step": 625 + }, + { + "epoch": 1.2901980962181632, + "grad_norm": 0.2012549638748169, + "learning_rate": 7.880870561282932e-06, + "loss": 0.2588, + "step": 626 + }, + { + "epoch": 1.292256238744533, + "grad_norm": 0.19951675832271576, + "learning_rate": 7.857961053837343e-06, + "loss": 0.2592, + "step": 627 + }, + { + "epoch": 1.294314381270903, + "grad_norm": 0.21163856983184814, + "learning_rate": 7.835051546391754e-06, + "loss": 0.26, + "step": 628 + }, + { + "epoch": 1.296372523797273, + "grad_norm": 0.21543577313423157, + "learning_rate": 7.812142038946164e-06, + "loss": 0.2486, + "step": 629 + }, + { + "epoch": 1.298430666323643, + "grad_norm": 0.20984649658203125, + "learning_rate": 7.789232531500573e-06, + "loss": 0.2603, + "step": 630 + }, + { + "epoch": 1.3004888088500128, + "grad_norm": 0.20047229528427124, + "learning_rate": 7.766323024054984e-06, + "loss": 0.2559, + "step": 631 + }, + { + "epoch": 1.3025469513763828, + "grad_norm": 0.21747010946273804, + "learning_rate": 7.743413516609394e-06, + "loss": 0.2563, + "step": 632 + }, + { + "epoch": 1.3046050939027527, + "grad_norm": 0.20818108320236206, + "learning_rate": 7.720504009163803e-06, + "loss": 0.2507, + "step": 633 + }, + { + "epoch": 1.3066632364291229, + "grad_norm": 0.19827309250831604, + "learning_rate": 7.697594501718214e-06, + "loss": 0.2578, + "step": 634 + }, + { + "epoch": 1.3087213789554926, + "grad_norm": 0.2122543305158615, + "learning_rate": 7.674684994272624e-06, + "loss": 0.2633, + "step": 635 + }, + { + "epoch": 1.3107795214818627, + "grad_norm": 0.20870576798915863, + "learning_rate": 7.651775486827033e-06, + "loss": 0.2616, + "step": 636 + }, + { + "epoch": 1.3128376640082327, + "grad_norm": 0.2069362998008728, + "learning_rate": 7.628865979381444e-06, + "loss": 0.2426, + "step": 637 + }, + { + "epoch": 1.3148958065346026, + "grad_norm": 0.19999894499778748, + "learning_rate": 7.6059564719358535e-06, + "loss": 0.2547, + "step": 638 + }, + { + "epoch": 1.3169539490609725, + "grad_norm": 0.20518334209918976, + "learning_rate": 7.583046964490264e-06, + "loss": 0.2571, + "step": 639 + }, + { + "epoch": 1.3190120915873425, + "grad_norm": 0.20558986067771912, + "learning_rate": 7.560137457044674e-06, + "loss": 0.2483, + "step": 640 + }, + { + "epoch": 1.3210702341137124, + "grad_norm": 0.21443884074687958, + "learning_rate": 7.5372279495990845e-06, + "loss": 0.2494, + "step": 641 + }, + { + "epoch": 1.3231283766400823, + "grad_norm": 0.2025483101606369, + "learning_rate": 7.514318442153494e-06, + "loss": 0.2473, + "step": 642 + }, + { + "epoch": 1.3251865191664522, + "grad_norm": 0.21094976365566254, + "learning_rate": 7.491408934707905e-06, + "loss": 0.2603, + "step": 643 + }, + { + "epoch": 1.3272446616928222, + "grad_norm": 0.2047881782054901, + "learning_rate": 7.4684994272623145e-06, + "loss": 0.2601, + "step": 644 + }, + { + "epoch": 1.3293028042191921, + "grad_norm": 0.2075866013765335, + "learning_rate": 7.445589919816725e-06, + "loss": 0.2644, + "step": 645 + }, + { + "epoch": 1.331360946745562, + "grad_norm": 0.2174414098262787, + "learning_rate": 7.422680412371135e-06, + "loss": 0.2609, + "step": 646 + }, + { + "epoch": 1.3334190892719322, + "grad_norm": 0.20820266008377075, + "learning_rate": 7.3997709049255455e-06, + "loss": 0.2535, + "step": 647 + }, + { + "epoch": 1.335477231798302, + "grad_norm": 0.20941515266895294, + "learning_rate": 7.376861397479955e-06, + "loss": 0.2578, + "step": 648 + }, + { + "epoch": 1.337535374324672, + "grad_norm": 0.2027975171804428, + "learning_rate": 7.353951890034365e-06, + "loss": 0.2573, + "step": 649 + }, + { + "epoch": 1.339593516851042, + "grad_norm": 0.209550142288208, + "learning_rate": 7.331042382588775e-06, + "loss": 0.2513, + "step": 650 + }, + { + "epoch": 1.341651659377412, + "grad_norm": 0.21425557136535645, + "learning_rate": 7.3081328751431845e-06, + "loss": 0.2568, + "step": 651 + }, + { + "epoch": 1.3437098019037819, + "grad_norm": 0.22760476171970367, + "learning_rate": 7.285223367697595e-06, + "loss": 0.2549, + "step": 652 + }, + { + "epoch": 1.3457679444301518, + "grad_norm": 0.21329441666603088, + "learning_rate": 7.262313860252005e-06, + "loss": 0.2467, + "step": 653 + }, + { + "epoch": 1.3478260869565217, + "grad_norm": 0.20949490368366241, + "learning_rate": 7.239404352806415e-06, + "loss": 0.2569, + "step": 654 + }, + { + "epoch": 1.3498842294828917, + "grad_norm": 0.21022753417491913, + "learning_rate": 7.216494845360825e-06, + "loss": 0.2644, + "step": 655 + }, + { + "epoch": 1.3519423720092616, + "grad_norm": 0.20240676403045654, + "learning_rate": 7.193585337915236e-06, + "loss": 0.2561, + "step": 656 + }, + { + "epoch": 1.3540005145356315, + "grad_norm": 0.19892892241477966, + "learning_rate": 7.1706758304696455e-06, + "loss": 0.2564, + "step": 657 + }, + { + "epoch": 1.3560586570620017, + "grad_norm": 0.22104541957378387, + "learning_rate": 7.147766323024056e-06, + "loss": 0.2466, + "step": 658 + }, + { + "epoch": 1.3581167995883714, + "grad_norm": 0.2074560970067978, + "learning_rate": 7.124856815578466e-06, + "loss": 0.2634, + "step": 659 + }, + { + "epoch": 1.3601749421147415, + "grad_norm": 0.20596396923065186, + "learning_rate": 7.101947308132876e-06, + "loss": 0.2566, + "step": 660 + }, + { + "epoch": 1.3622330846411115, + "grad_norm": 0.2072969526052475, + "learning_rate": 7.079037800687286e-06, + "loss": 0.2603, + "step": 661 + }, + { + "epoch": 1.3642912271674814, + "grad_norm": 0.21680790185928345, + "learning_rate": 7.056128293241697e-06, + "loss": 0.2536, + "step": 662 + }, + { + "epoch": 1.3663493696938513, + "grad_norm": 0.2035921961069107, + "learning_rate": 7.0332187857961065e-06, + "loss": 0.2567, + "step": 663 + }, + { + "epoch": 1.3684075122202213, + "grad_norm": 0.21186605095863342, + "learning_rate": 7.010309278350515e-06, + "loss": 0.2575, + "step": 664 + }, + { + "epoch": 1.3704656547465912, + "grad_norm": 0.21388404071331024, + "learning_rate": 6.987399770904926e-06, + "loss": 0.2522, + "step": 665 + }, + { + "epoch": 1.3725237972729611, + "grad_norm": 0.21118783950805664, + "learning_rate": 6.964490263459336e-06, + "loss": 0.25, + "step": 666 + }, + { + "epoch": 1.374581939799331, + "grad_norm": 0.21162322163581848, + "learning_rate": 6.941580756013746e-06, + "loss": 0.253, + "step": 667 + }, + { + "epoch": 1.376640082325701, + "grad_norm": 0.21186329424381256, + "learning_rate": 6.918671248568156e-06, + "loss": 0.2589, + "step": 668 + }, + { + "epoch": 1.378698224852071, + "grad_norm": 0.21206888556480408, + "learning_rate": 6.895761741122567e-06, + "loss": 0.2629, + "step": 669 + }, + { + "epoch": 1.3807563673784409, + "grad_norm": 0.21045179665088654, + "learning_rate": 6.872852233676976e-06, + "loss": 0.2523, + "step": 670 + }, + { + "epoch": 1.382814509904811, + "grad_norm": 0.21106329560279846, + "learning_rate": 6.849942726231387e-06, + "loss": 0.2611, + "step": 671 + }, + { + "epoch": 1.3848726524311807, + "grad_norm": 0.20593757927417755, + "learning_rate": 6.827033218785797e-06, + "loss": 0.2537, + "step": 672 + }, + { + "epoch": 1.386930794957551, + "grad_norm": 0.2040368914604187, + "learning_rate": 6.804123711340207e-06, + "loss": 0.2545, + "step": 673 + }, + { + "epoch": 1.3889889374839208, + "grad_norm": 0.2148980051279068, + "learning_rate": 6.781214203894617e-06, + "loss": 0.264, + "step": 674 + }, + { + "epoch": 1.3910470800102908, + "grad_norm": 0.204456627368927, + "learning_rate": 6.758304696449028e-06, + "loss": 0.2609, + "step": 675 + }, + { + "epoch": 1.3931052225366607, + "grad_norm": 0.20230846107006073, + "learning_rate": 6.735395189003437e-06, + "loss": 0.2644, + "step": 676 + }, + { + "epoch": 1.3951633650630306, + "grad_norm": 0.205158531665802, + "learning_rate": 6.712485681557846e-06, + "loss": 0.2611, + "step": 677 + }, + { + "epoch": 1.3972215075894006, + "grad_norm": 0.21487553417682648, + "learning_rate": 6.689576174112257e-06, + "loss": 0.2492, + "step": 678 + }, + { + "epoch": 1.3992796501157705, + "grad_norm": 0.21277402341365814, + "learning_rate": 6.666666666666667e-06, + "loss": 0.2491, + "step": 679 + }, + { + "epoch": 1.4013377926421404, + "grad_norm": 0.2049219310283661, + "learning_rate": 6.643757159221077e-06, + "loss": 0.2444, + "step": 680 + }, + { + "epoch": 1.4033959351685104, + "grad_norm": 0.23122920095920563, + "learning_rate": 6.620847651775487e-06, + "loss": 0.2522, + "step": 681 + }, + { + "epoch": 1.4054540776948803, + "grad_norm": 0.2067662477493286, + "learning_rate": 6.597938144329898e-06, + "loss": 0.2583, + "step": 682 + }, + { + "epoch": 1.4075122202212502, + "grad_norm": 0.2043958306312561, + "learning_rate": 6.575028636884307e-06, + "loss": 0.2603, + "step": 683 + }, + { + "epoch": 1.4095703627476204, + "grad_norm": 0.21982067823410034, + "learning_rate": 6.552119129438718e-06, + "loss": 0.246, + "step": 684 + }, + { + "epoch": 1.41162850527399, + "grad_norm": 0.21510522067546844, + "learning_rate": 6.529209621993128e-06, + "loss": 0.2554, + "step": 685 + }, + { + "epoch": 1.4136866478003602, + "grad_norm": 0.24448052048683167, + "learning_rate": 6.506300114547538e-06, + "loss": 0.256, + "step": 686 + }, + { + "epoch": 1.4157447903267302, + "grad_norm": 0.2068399339914322, + "learning_rate": 6.483390607101948e-06, + "loss": 0.2566, + "step": 687 + }, + { + "epoch": 1.4178029328531, + "grad_norm": 0.20870736241340637, + "learning_rate": 6.460481099656359e-06, + "loss": 0.2493, + "step": 688 + }, + { + "epoch": 1.41986107537947, + "grad_norm": 0.22065278887748718, + "learning_rate": 6.437571592210768e-06, + "loss": 0.2566, + "step": 689 + }, + { + "epoch": 1.42191921790584, + "grad_norm": 0.21523869037628174, + "learning_rate": 6.414662084765179e-06, + "loss": 0.2579, + "step": 690 + }, + { + "epoch": 1.42397736043221, + "grad_norm": 0.21578392386436462, + "learning_rate": 6.391752577319588e-06, + "loss": 0.2555, + "step": 691 + }, + { + "epoch": 1.4260355029585798, + "grad_norm": 0.2096480280160904, + "learning_rate": 6.3688430698739976e-06, + "loss": 0.2534, + "step": 692 + }, + { + "epoch": 1.4280936454849498, + "grad_norm": 0.21274186670780182, + "learning_rate": 6.345933562428408e-06, + "loss": 0.2521, + "step": 693 + }, + { + "epoch": 1.4301517880113197, + "grad_norm": 0.21426336467266083, + "learning_rate": 6.323024054982818e-06, + "loss": 0.2589, + "step": 694 + }, + { + "epoch": 1.4322099305376899, + "grad_norm": 0.21294309198856354, + "learning_rate": 6.3001145475372285e-06, + "loss": 0.2615, + "step": 695 + }, + { + "epoch": 1.4342680730640596, + "grad_norm": 0.2021908164024353, + "learning_rate": 6.277205040091638e-06, + "loss": 0.2714, + "step": 696 + }, + { + "epoch": 1.4363262155904297, + "grad_norm": 0.21605439484119415, + "learning_rate": 6.254295532646049e-06, + "loss": 0.2592, + "step": 697 + }, + { + "epoch": 1.4383843581167997, + "grad_norm": 0.2154022753238678, + "learning_rate": 6.231386025200459e-06, + "loss": 0.2633, + "step": 698 + }, + { + "epoch": 1.4404425006431696, + "grad_norm": 0.2178344875574112, + "learning_rate": 6.208476517754869e-06, + "loss": 0.2685, + "step": 699 + }, + { + "epoch": 1.4425006431695395, + "grad_norm": 0.21423941850662231, + "learning_rate": 6.185567010309279e-06, + "loss": 0.2474, + "step": 700 + }, + { + "epoch": 1.4425006431695395, + "eval_loss": 0.27773216366767883, + "eval_runtime": 2423.2314, + "eval_samples_per_second": 3.208, + "eval_steps_per_second": 0.802, + "step": 700 + }, + { + "epoch": 1.4445587856959095, + "grad_norm": 0.19836685061454773, + "learning_rate": 6.1626575028636895e-06, + "loss": 0.2556, + "step": 701 + }, + { + "epoch": 1.4466169282222794, + "grad_norm": 0.21015697717666626, + "learning_rate": 6.139747995418099e-06, + "loss": 0.2605, + "step": 702 + }, + { + "epoch": 1.4486750707486493, + "grad_norm": 0.2158636897802353, + "learning_rate": 6.11683848797251e-06, + "loss": 0.252, + "step": 703 + }, + { + "epoch": 1.4507332132750193, + "grad_norm": 0.2136162966489792, + "learning_rate": 6.09392898052692e-06, + "loss": 0.2451, + "step": 704 + }, + { + "epoch": 1.4527913558013892, + "grad_norm": 0.21352505683898926, + "learning_rate": 6.0710194730813285e-06, + "loss": 0.2649, + "step": 705 + }, + { + "epoch": 1.4548494983277591, + "grad_norm": 0.22503146529197693, + "learning_rate": 6.048109965635739e-06, + "loss": 0.2604, + "step": 706 + }, + { + "epoch": 1.456907640854129, + "grad_norm": 0.2114841490983963, + "learning_rate": 6.025200458190149e-06, + "loss": 0.2547, + "step": 707 + }, + { + "epoch": 1.4589657833804992, + "grad_norm": 0.22603987157344818, + "learning_rate": 6.0022909507445594e-06, + "loss": 0.2551, + "step": 708 + }, + { + "epoch": 1.461023925906869, + "grad_norm": 0.2188458889722824, + "learning_rate": 5.979381443298969e-06, + "loss": 0.2553, + "step": 709 + }, + { + "epoch": 1.463082068433239, + "grad_norm": 0.21128129959106445, + "learning_rate": 5.95647193585338e-06, + "loss": 0.258, + "step": 710 + }, + { + "epoch": 1.465140210959609, + "grad_norm": 0.22289037704467773, + "learning_rate": 5.9335624284077895e-06, + "loss": 0.2709, + "step": 711 + }, + { + "epoch": 1.467198353485979, + "grad_norm": 0.21750517189502716, + "learning_rate": 5.9106529209622e-06, + "loss": 0.2597, + "step": 712 + }, + { + "epoch": 1.4692564960123489, + "grad_norm": 0.21022778749465942, + "learning_rate": 5.88774341351661e-06, + "loss": 0.2533, + "step": 713 + }, + { + "epoch": 1.4713146385387188, + "grad_norm": 0.21544480323791504, + "learning_rate": 5.8648339060710204e-06, + "loss": 0.255, + "step": 714 + }, + { + "epoch": 1.4733727810650887, + "grad_norm": 0.20856665074825287, + "learning_rate": 5.84192439862543e-06, + "loss": 0.2648, + "step": 715 + }, + { + "epoch": 1.4754309235914587, + "grad_norm": 0.2105010449886322, + "learning_rate": 5.819014891179841e-06, + "loss": 0.2611, + "step": 716 + }, + { + "epoch": 1.4774890661178286, + "grad_norm": 0.21749204397201538, + "learning_rate": 5.7961053837342505e-06, + "loss": 0.2551, + "step": 717 + }, + { + "epoch": 1.4795472086441985, + "grad_norm": 0.20478859543800354, + "learning_rate": 5.7731958762886594e-06, + "loss": 0.2554, + "step": 718 + }, + { + "epoch": 1.4816053511705687, + "grad_norm": 0.213475301861763, + "learning_rate": 5.75028636884307e-06, + "loss": 0.2622, + "step": 719 + }, + { + "epoch": 1.4836634936969384, + "grad_norm": 0.2008693963289261, + "learning_rate": 5.72737686139748e-06, + "loss": 0.2483, + "step": 720 + }, + { + "epoch": 1.4857216362233086, + "grad_norm": 0.19621135294437408, + "learning_rate": 5.70446735395189e-06, + "loss": 0.2553, + "step": 721 + }, + { + "epoch": 1.4877797787496785, + "grad_norm": 0.227009579539299, + "learning_rate": 5.6815578465063e-06, + "loss": 0.2529, + "step": 722 + }, + { + "epoch": 1.4898379212760484, + "grad_norm": 0.21584804356098175, + "learning_rate": 5.658648339060711e-06, + "loss": 0.2545, + "step": 723 + }, + { + "epoch": 1.4918960638024183, + "grad_norm": 0.2207970768213272, + "learning_rate": 5.6357388316151204e-06, + "loss": 0.2463, + "step": 724 + }, + { + "epoch": 1.4939542063287883, + "grad_norm": 0.22498710453510284, + "learning_rate": 5.612829324169531e-06, + "loss": 0.2593, + "step": 725 + }, + { + "epoch": 1.4960123488551582, + "grad_norm": 0.2146955132484436, + "learning_rate": 5.589919816723941e-06, + "loss": 0.2466, + "step": 726 + }, + { + "epoch": 1.4980704913815281, + "grad_norm": 0.21701963245868683, + "learning_rate": 5.567010309278351e-06, + "loss": 0.2602, + "step": 727 + }, + { + "epoch": 1.500128633907898, + "grad_norm": 0.2154153287410736, + "learning_rate": 5.544100801832761e-06, + "loss": 0.2652, + "step": 728 + }, + { + "epoch": 1.502186776434268, + "grad_norm": 0.2135971337556839, + "learning_rate": 5.521191294387172e-06, + "loss": 0.2465, + "step": 729 + }, + { + "epoch": 1.5042449189606382, + "grad_norm": 0.21887153387069702, + "learning_rate": 5.4982817869415815e-06, + "loss": 0.2553, + "step": 730 + }, + { + "epoch": 1.5063030614870079, + "grad_norm": 0.21986471116542816, + "learning_rate": 5.475372279495992e-06, + "loss": 0.2568, + "step": 731 + }, + { + "epoch": 1.508361204013378, + "grad_norm": 0.2224634885787964, + "learning_rate": 5.452462772050401e-06, + "loss": 0.2609, + "step": 732 + }, + { + "epoch": 1.5104193465397477, + "grad_norm": 0.22347122430801392, + "learning_rate": 5.429553264604811e-06, + "loss": 0.2557, + "step": 733 + }, + { + "epoch": 1.512477489066118, + "grad_norm": 0.21803030371665955, + "learning_rate": 5.406643757159221e-06, + "loss": 0.2622, + "step": 734 + }, + { + "epoch": 1.5145356315924878, + "grad_norm": 0.2078487128019333, + "learning_rate": 5.383734249713631e-06, + "loss": 0.249, + "step": 735 + }, + { + "epoch": 1.5165937741188578, + "grad_norm": 0.20815445482730865, + "learning_rate": 5.360824742268042e-06, + "loss": 0.2538, + "step": 736 + }, + { + "epoch": 1.5186519166452277, + "grad_norm": 0.21298891305923462, + "learning_rate": 5.337915234822451e-06, + "loss": 0.2532, + "step": 737 + }, + { + "epoch": 1.5207100591715976, + "grad_norm": 0.21032264828681946, + "learning_rate": 5.315005727376862e-06, + "loss": 0.2428, + "step": 738 + }, + { + "epoch": 1.5227682016979676, + "grad_norm": 0.23191553354263306, + "learning_rate": 5.292096219931272e-06, + "loss": 0.2545, + "step": 739 + }, + { + "epoch": 1.5248263442243375, + "grad_norm": 0.21168164908885956, + "learning_rate": 5.269186712485682e-06, + "loss": 0.2668, + "step": 740 + }, + { + "epoch": 1.5268844867507076, + "grad_norm": 0.2142658829689026, + "learning_rate": 5.246277205040092e-06, + "loss": 0.2589, + "step": 741 + }, + { + "epoch": 1.5289426292770774, + "grad_norm": 0.2130551040172577, + "learning_rate": 5.223367697594503e-06, + "loss": 0.248, + "step": 742 + }, + { + "epoch": 1.5310007718034475, + "grad_norm": 0.2171664535999298, + "learning_rate": 5.200458190148912e-06, + "loss": 0.2539, + "step": 743 + }, + { + "epoch": 1.5330589143298172, + "grad_norm": 0.21375024318695068, + "learning_rate": 5.177548682703323e-06, + "loss": 0.2471, + "step": 744 + }, + { + "epoch": 1.5351170568561874, + "grad_norm": 0.21037080883979797, + "learning_rate": 5.154639175257732e-06, + "loss": 0.2526, + "step": 745 + }, + { + "epoch": 1.537175199382557, + "grad_norm": 0.2103818953037262, + "learning_rate": 5.131729667812142e-06, + "loss": 0.2609, + "step": 746 + }, + { + "epoch": 1.5392333419089272, + "grad_norm": 0.21307708323001862, + "learning_rate": 5.108820160366552e-06, + "loss": 0.2606, + "step": 747 + }, + { + "epoch": 1.5412914844352972, + "grad_norm": 0.2052801549434662, + "learning_rate": 5.085910652920962e-06, + "loss": 0.2462, + "step": 748 + }, + { + "epoch": 1.543349626961667, + "grad_norm": 0.2059316784143448, + "learning_rate": 5.0630011454753726e-06, + "loss": 0.2593, + "step": 749 + }, + { + "epoch": 1.545407769488037, + "grad_norm": 0.211748406291008, + "learning_rate": 5.040091638029782e-06, + "loss": 0.2582, + "step": 750 + }, + { + "epoch": 1.547465912014407, + "grad_norm": 0.20883141458034515, + "learning_rate": 5.017182130584193e-06, + "loss": 0.251, + "step": 751 + }, + { + "epoch": 1.549524054540777, + "grad_norm": 0.21496839821338654, + "learning_rate": 4.994272623138603e-06, + "loss": 0.2486, + "step": 752 + }, + { + "epoch": 1.5515821970671468, + "grad_norm": 0.21443761885166168, + "learning_rate": 4.971363115693013e-06, + "loss": 0.2541, + "step": 753 + }, + { + "epoch": 1.553640339593517, + "grad_norm": 0.2164083868265152, + "learning_rate": 4.948453608247423e-06, + "loss": 0.2515, + "step": 754 + }, + { + "epoch": 1.5556984821198867, + "grad_norm": 0.22733120620250702, + "learning_rate": 4.9255441008018336e-06, + "loss": 0.2674, + "step": 755 + }, + { + "epoch": 1.5577566246462569, + "grad_norm": 0.21141202747821808, + "learning_rate": 4.902634593356243e-06, + "loss": 0.2586, + "step": 756 + }, + { + "epoch": 1.5598147671726266, + "grad_norm": 0.20612719655036926, + "learning_rate": 4.879725085910653e-06, + "loss": 0.2417, + "step": 757 + }, + { + "epoch": 1.5618729096989967, + "grad_norm": 0.21028929948806763, + "learning_rate": 4.856815578465064e-06, + "loss": 0.2546, + "step": 758 + }, + { + "epoch": 1.5639310522253667, + "grad_norm": 0.2196635901927948, + "learning_rate": 4.833906071019473e-06, + "loss": 0.2527, + "step": 759 + }, + { + "epoch": 1.5659891947517366, + "grad_norm": 0.20016127824783325, + "learning_rate": 4.810996563573884e-06, + "loss": 0.2629, + "step": 760 + }, + { + "epoch": 1.5680473372781065, + "grad_norm": 0.20597878098487854, + "learning_rate": 4.788087056128294e-06, + "loss": 0.2544, + "step": 761 + }, + { + "epoch": 1.5701054798044765, + "grad_norm": 0.20151163637638092, + "learning_rate": 4.7651775486827035e-06, + "loss": 0.2569, + "step": 762 + }, + { + "epoch": 1.5721636223308464, + "grad_norm": 0.21117815375328064, + "learning_rate": 4.742268041237113e-06, + "loss": 0.2602, + "step": 763 + }, + { + "epoch": 1.5742217648572163, + "grad_norm": 0.20184555649757385, + "learning_rate": 4.719358533791524e-06, + "loss": 0.2673, + "step": 764 + }, + { + "epoch": 1.5762799073835865, + "grad_norm": 0.20125100016593933, + "learning_rate": 4.6964490263459336e-06, + "loss": 0.2669, + "step": 765 + }, + { + "epoch": 1.5783380499099562, + "grad_norm": 0.2209872603416443, + "learning_rate": 4.673539518900344e-06, + "loss": 0.2569, + "step": 766 + }, + { + "epoch": 1.5803961924363263, + "grad_norm": 0.21065855026245117, + "learning_rate": 4.650630011454754e-06, + "loss": 0.2477, + "step": 767 + }, + { + "epoch": 1.582454334962696, + "grad_norm": 0.20995444059371948, + "learning_rate": 4.6277205040091645e-06, + "loss": 0.2563, + "step": 768 + }, + { + "epoch": 1.5845124774890662, + "grad_norm": 0.21762295067310333, + "learning_rate": 4.604810996563574e-06, + "loss": 0.2443, + "step": 769 + }, + { + "epoch": 1.586570620015436, + "grad_norm": 0.21741704642772675, + "learning_rate": 4.581901489117984e-06, + "loss": 0.2442, + "step": 770 + }, + { + "epoch": 1.588628762541806, + "grad_norm": 0.21586772799491882, + "learning_rate": 4.5589919816723946e-06, + "loss": 0.2484, + "step": 771 + }, + { + "epoch": 1.590686905068176, + "grad_norm": 0.22184152901172638, + "learning_rate": 4.536082474226804e-06, + "loss": 0.2522, + "step": 772 + }, + { + "epoch": 1.592745047594546, + "grad_norm": 0.22210553288459778, + "learning_rate": 4.513172966781215e-06, + "loss": 0.2552, + "step": 773 + }, + { + "epoch": 1.5948031901209159, + "grad_norm": 0.2075122743844986, + "learning_rate": 4.490263459335625e-06, + "loss": 0.263, + "step": 774 + }, + { + "epoch": 1.5968613326472858, + "grad_norm": 0.20110896229743958, + "learning_rate": 4.467353951890035e-06, + "loss": 0.248, + "step": 775 + }, + { + "epoch": 1.5989194751736557, + "grad_norm": 0.2067912071943283, + "learning_rate": 4.444444444444444e-06, + "loss": 0.2485, + "step": 776 + }, + { + "epoch": 1.6009776177000257, + "grad_norm": 0.2091452181339264, + "learning_rate": 4.421534936998855e-06, + "loss": 0.2783, + "step": 777 + }, + { + "epoch": 1.6030357602263958, + "grad_norm": 0.21414563059806824, + "learning_rate": 4.3986254295532645e-06, + "loss": 0.2502, + "step": 778 + }, + { + "epoch": 1.6050939027527655, + "grad_norm": 0.21657651662826538, + "learning_rate": 4.375715922107675e-06, + "loss": 0.2589, + "step": 779 + }, + { + "epoch": 1.6071520452791357, + "grad_norm": 0.21607093513011932, + "learning_rate": 4.352806414662085e-06, + "loss": 0.2618, + "step": 780 + }, + { + "epoch": 1.6092101878055054, + "grad_norm": 0.21846850216388702, + "learning_rate": 4.329896907216495e-06, + "loss": 0.2549, + "step": 781 + }, + { + "epoch": 1.6112683303318756, + "grad_norm": 0.21873261034488678, + "learning_rate": 4.306987399770905e-06, + "loss": 0.2448, + "step": 782 + }, + { + "epoch": 1.6133264728582455, + "grad_norm": 0.22608645260334015, + "learning_rate": 4.284077892325315e-06, + "loss": 0.2559, + "step": 783 + }, + { + "epoch": 1.6153846153846154, + "grad_norm": 0.2121078372001648, + "learning_rate": 4.2611683848797255e-06, + "loss": 0.2515, + "step": 784 + }, + { + "epoch": 1.6174427579109854, + "grad_norm": 0.227590411901474, + "learning_rate": 4.238258877434135e-06, + "loss": 0.2549, + "step": 785 + }, + { + "epoch": 1.6195009004373553, + "grad_norm": 0.201515793800354, + "learning_rate": 4.215349369988546e-06, + "loss": 0.2601, + "step": 786 + }, + { + "epoch": 1.6215590429637252, + "grad_norm": 0.21896880865097046, + "learning_rate": 4.192439862542956e-06, + "loss": 0.2564, + "step": 787 + }, + { + "epoch": 1.6236171854900951, + "grad_norm": 0.21509996056556702, + "learning_rate": 4.169530355097366e-06, + "loss": 0.2491, + "step": 788 + }, + { + "epoch": 1.6256753280164653, + "grad_norm": 0.22020220756530762, + "learning_rate": 4.146620847651776e-06, + "loss": 0.2617, + "step": 789 + }, + { + "epoch": 1.627733470542835, + "grad_norm": 0.21420395374298096, + "learning_rate": 4.123711340206186e-06, + "loss": 0.2561, + "step": 790 + }, + { + "epoch": 1.6297916130692052, + "grad_norm": 0.2270808070898056, + "learning_rate": 4.100801832760596e-06, + "loss": 0.2621, + "step": 791 + }, + { + "epoch": 1.6318497555955749, + "grad_norm": 0.2320822924375534, + "learning_rate": 4.077892325315006e-06, + "loss": 0.269, + "step": 792 + }, + { + "epoch": 1.633907898121945, + "grad_norm": 0.21081334352493286, + "learning_rate": 4.054982817869416e-06, + "loss": 0.2468, + "step": 793 + }, + { + "epoch": 1.6359660406483147, + "grad_norm": 0.2204331010580063, + "learning_rate": 4.032073310423826e-06, + "loss": 0.2578, + "step": 794 + }, + { + "epoch": 1.638024183174685, + "grad_norm": 0.20907023549079895, + "learning_rate": 4.009163802978236e-06, + "loss": 0.2523, + "step": 795 + }, + { + "epoch": 1.6400823257010548, + "grad_norm": 0.23108816146850586, + "learning_rate": 3.986254295532647e-06, + "loss": 0.2619, + "step": 796 + }, + { + "epoch": 1.6421404682274248, + "grad_norm": 0.20781853795051575, + "learning_rate": 3.9633447880870564e-06, + "loss": 0.2496, + "step": 797 + }, + { + "epoch": 1.6441986107537947, + "grad_norm": 0.20635871589183807, + "learning_rate": 3.940435280641466e-06, + "loss": 0.2527, + "step": 798 + }, + { + "epoch": 1.6462567532801646, + "grad_norm": 0.21636071801185608, + "learning_rate": 3.917525773195877e-06, + "loss": 0.2585, + "step": 799 + }, + { + "epoch": 1.6483148958065346, + "grad_norm": 0.20485584437847137, + "learning_rate": 3.8946162657502865e-06, + "loss": 0.259, + "step": 800 + }, + { + "epoch": 1.6483148958065346, + "eval_loss": 0.27598318457603455, + "eval_runtime": 2425.8429, + "eval_samples_per_second": 3.205, + "eval_steps_per_second": 0.801, + "step": 800 + } + ], + "logging_steps": 1, + "max_steps": 970, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.6435290089649766e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-800/training_args.bin b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-800/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..5999c7ee9dd10ee9076d748e4757533e635fa832 --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ee55a11f5a306eb7c39b536fdfe2459bc279e468da50f6adda478c4deffcb812 +size 5688 diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-900/README.md b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-900/README.md new file mode 100644 index 0000000000000000000000000000000000000000..fb0d77d70fdc5c829c8889cb85828736b7eb9714 --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-900/README.md @@ -0,0 +1,202 @@ +--- +base_model: unsloth/codegemma-7b-it +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.15.0 \ No newline at end of file diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-900/adapter_config.json b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-900/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e841602c6a59fc7b085ac647af4d4c312445d261 --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-900/adapter_config.json @@ -0,0 +1,39 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "unsloth/codegemma-7b-it", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "o_proj", + "up_proj", + "q_proj", + "down_proj", + "gate_proj", + "k_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-900/adapter_model.safetensors b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-900/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c4ffd54a85e38adb92b7c4d758703dd2ad3ca7a9 --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-900/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca7d03cdc662ff79d05c9834799c3915ccc743fb70c098e1a55a0f70c1baa12b +size 800116456 diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-900/optimizer.pt b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-900/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..bcd0beced4a86c60581fc14b45fd7f8b5309fea8 --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-900/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a262786ecf93662243225124965cb6f7cbbedeefa2e7d43ddd62c626595a4f6c +size 406743860 diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-900/rng_state.pth b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-900/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..9e592b50af4d30623a5c177e1ac193d73b203039 --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-900/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2c4aa5f142e26419c0313b75006c5ad7308aabd3eb589818b54e875dbea3034b +size 14244 diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-900/scheduler.pt b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-900/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..bb9b43e7f59ca31e64f74eedbb1b85dfff0be323 --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-900/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe8cba74dd0870e683e84bb4c242f3768c0e5606cdbf9b2a51851192fbb5ffa1 +size 1064 diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-900/special_tokens_map.json b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-900/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..f6119589e367b2de0fc8cbd2f1217667532e3174 --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-900/special_tokens_map.json @@ -0,0 +1,30 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-900/tokenizer.json b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-900/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..45a5e23f54141c5f4f97a8d58f3ffadc28e287ba --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-900/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d964a2c8346d40f95791533eae48730d5f163c2e65fd16333560fd3e661df318 +size 34362915 diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-900/tokenizer.model b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-900/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..71a98ce40269d847e58957e1e070d9ae8eb184af --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-900/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:583f2ebd2a1936009b7da991ea255504db68c7a9713a78673d1335a87098966c +size 4241023 diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-900/tokenizer_config.json b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-900/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9b9b1b4acdd4afcedae39d1cf6f0bc7ef7d9910f --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-900/tokenizer_config.json @@ -0,0 +1,2011 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "<|fim_prefix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "<|fim_middle|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "<|fim_suffix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "<|file_separator|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "139": { + "content": "▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "140": { + "content": "▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "141": { + "content": "▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "142": { + "content": "▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "143": { + "content": "▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "144": { + "content": "▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "145": { + "content": "▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "146": { + "content": "▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "147": { + "content": "▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "148": { + "content": "▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "149": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "150": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "152": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "153": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "154": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "155": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "156": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "157": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "158": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "159": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "160": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "161": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "162": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "163": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "164": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "165": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "166": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "167": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "168": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255968": { + "content": "[toxicity=0]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255969": { + "content": "\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255970": { + "content": "\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255971": { + "content": "\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255972": { + "content": "\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255973": { + "content": "\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255974": { + "content": "\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255975": { + "content": "\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255976": { + "content": "\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255977": { + "content": "\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255978": { + "content": "\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255979": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255980": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255981": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255982": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255983": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255984": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255985": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255986": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255987": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255988": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255989": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255990": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255991": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255992": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255993": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255994": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255995": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255996": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255997": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255998": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255999": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "bos_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "extra_special_tokens": {}, + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "padding_side": "left", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-900/trainer_state.json b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-900/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..4797d841ebdba509b207bc74d3d27a70d23955a0 --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-900/trainer_state.json @@ -0,0 +1,6406 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.8541291484435298, + "eval_steps": 100, + "global_step": 900, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0020581425263699513, + "grad_norm": 11.994463920593262, + "learning_rate": 2.061855670103093e-07, + "loss": 2.91, + "step": 1 + }, + { + "epoch": 0.004116285052739903, + "grad_norm": 11.769092559814453, + "learning_rate": 4.123711340206186e-07, + "loss": 2.8686, + "step": 2 + }, + { + "epoch": 0.0061744275791098535, + "grad_norm": 13.05551815032959, + "learning_rate": 6.185567010309279e-07, + "loss": 3.0286, + "step": 3 + }, + { + "epoch": 0.008232570105479805, + "grad_norm": 12.334521293640137, + "learning_rate": 8.247422680412372e-07, + "loss": 2.904, + "step": 4 + }, + { + "epoch": 0.010290712631849755, + "grad_norm": 12.075353622436523, + "learning_rate": 1.0309278350515464e-06, + "loss": 2.8991, + "step": 5 + }, + { + "epoch": 0.012348855158219707, + "grad_norm": 11.86032485961914, + "learning_rate": 1.2371134020618557e-06, + "loss": 3.0007, + "step": 6 + }, + { + "epoch": 0.014406997684589657, + "grad_norm": 10.10457992553711, + "learning_rate": 1.4432989690721649e-06, + "loss": 2.8493, + "step": 7 + }, + { + "epoch": 0.01646514021095961, + "grad_norm": 8.56408405303955, + "learning_rate": 1.6494845360824744e-06, + "loss": 2.9573, + "step": 8 + }, + { + "epoch": 0.01852328273732956, + "grad_norm": 6.307392120361328, + "learning_rate": 1.8556701030927837e-06, + "loss": 2.9507, + "step": 9 + }, + { + "epoch": 0.02058142526369951, + "grad_norm": 4.276430130004883, + "learning_rate": 2.061855670103093e-06, + "loss": 2.8988, + "step": 10 + }, + { + "epoch": 0.022639567790069464, + "grad_norm": 2.5912015438079834, + "learning_rate": 2.268041237113402e-06, + "loss": 2.9926, + "step": 11 + }, + { + "epoch": 0.024697710316439414, + "grad_norm": 2.018446207046509, + "learning_rate": 2.4742268041237115e-06, + "loss": 2.9874, + "step": 12 + }, + { + "epoch": 0.026755852842809364, + "grad_norm": 1.8558588027954102, + "learning_rate": 2.680412371134021e-06, + "loss": 2.8608, + "step": 13 + }, + { + "epoch": 0.028813995369179314, + "grad_norm": 1.9658265113830566, + "learning_rate": 2.8865979381443297e-06, + "loss": 2.8596, + "step": 14 + }, + { + "epoch": 0.030872137895549268, + "grad_norm": 1.872044563293457, + "learning_rate": 3.0927835051546395e-06, + "loss": 2.8836, + "step": 15 + }, + { + "epoch": 0.03293028042191922, + "grad_norm": 1.8884096145629883, + "learning_rate": 3.298969072164949e-06, + "loss": 2.9383, + "step": 16 + }, + { + "epoch": 0.03498842294828917, + "grad_norm": 1.8795744180679321, + "learning_rate": 3.5051546391752577e-06, + "loss": 2.883, + "step": 17 + }, + { + "epoch": 0.03704656547465912, + "grad_norm": 1.783678412437439, + "learning_rate": 3.7113402061855674e-06, + "loss": 2.8019, + "step": 18 + }, + { + "epoch": 0.039104708001029075, + "grad_norm": 1.820617914199829, + "learning_rate": 3.917525773195877e-06, + "loss": 2.8813, + "step": 19 + }, + { + "epoch": 0.04116285052739902, + "grad_norm": 1.8188731670379639, + "learning_rate": 4.123711340206186e-06, + "loss": 2.8401, + "step": 20 + }, + { + "epoch": 0.043220993053768975, + "grad_norm": 1.7305251359939575, + "learning_rate": 4.329896907216495e-06, + "loss": 2.7478, + "step": 21 + }, + { + "epoch": 0.04527913558013893, + "grad_norm": 1.7014551162719727, + "learning_rate": 4.536082474226804e-06, + "loss": 2.7356, + "step": 22 + }, + { + "epoch": 0.047337278106508875, + "grad_norm": 1.677381157875061, + "learning_rate": 4.742268041237113e-06, + "loss": 2.7593, + "step": 23 + }, + { + "epoch": 0.04939542063287883, + "grad_norm": 1.628554344177246, + "learning_rate": 4.948453608247423e-06, + "loss": 2.7689, + "step": 24 + }, + { + "epoch": 0.051453563159248775, + "grad_norm": 1.4968128204345703, + "learning_rate": 5.154639175257732e-06, + "loss": 2.6613, + "step": 25 + }, + { + "epoch": 0.05351170568561873, + "grad_norm": 1.4734832048416138, + "learning_rate": 5.360824742268042e-06, + "loss": 2.7095, + "step": 26 + }, + { + "epoch": 0.05556984821198868, + "grad_norm": 1.3745571374893188, + "learning_rate": 5.567010309278351e-06, + "loss": 2.655, + "step": 27 + }, + { + "epoch": 0.05762799073835863, + "grad_norm": 1.3381729125976562, + "learning_rate": 5.7731958762886594e-06, + "loss": 2.55, + "step": 28 + }, + { + "epoch": 0.05968613326472858, + "grad_norm": 1.3388073444366455, + "learning_rate": 5.979381443298969e-06, + "loss": 2.5219, + "step": 29 + }, + { + "epoch": 0.061744275791098535, + "grad_norm": 1.317008376121521, + "learning_rate": 6.185567010309279e-06, + "loss": 2.4491, + "step": 30 + }, + { + "epoch": 0.06380241831746848, + "grad_norm": 1.3210794925689697, + "learning_rate": 6.391752577319588e-06, + "loss": 2.4358, + "step": 31 + }, + { + "epoch": 0.06586056084383844, + "grad_norm": 1.182519555091858, + "learning_rate": 6.597938144329898e-06, + "loss": 2.4514, + "step": 32 + }, + { + "epoch": 0.06791870337020839, + "grad_norm": 1.2238099575042725, + "learning_rate": 6.804123711340207e-06, + "loss": 2.442, + "step": 33 + }, + { + "epoch": 0.06997684589657834, + "grad_norm": 1.1793314218521118, + "learning_rate": 7.010309278350515e-06, + "loss": 2.3864, + "step": 34 + }, + { + "epoch": 0.0720349884229483, + "grad_norm": 1.1983020305633545, + "learning_rate": 7.216494845360825e-06, + "loss": 2.3796, + "step": 35 + }, + { + "epoch": 0.07409313094931824, + "grad_norm": 1.2189652919769287, + "learning_rate": 7.422680412371135e-06, + "loss": 2.4152, + "step": 36 + }, + { + "epoch": 0.07615127347568819, + "grad_norm": 1.14923095703125, + "learning_rate": 7.628865979381444e-06, + "loss": 2.3298, + "step": 37 + }, + { + "epoch": 0.07820941600205815, + "grad_norm": 1.147013545036316, + "learning_rate": 7.835051546391754e-06, + "loss": 2.2488, + "step": 38 + }, + { + "epoch": 0.0802675585284281, + "grad_norm": 1.133981466293335, + "learning_rate": 8.041237113402063e-06, + "loss": 2.1825, + "step": 39 + }, + { + "epoch": 0.08232570105479804, + "grad_norm": 1.1686867475509644, + "learning_rate": 8.247422680412371e-06, + "loss": 2.2282, + "step": 40 + }, + { + "epoch": 0.084383843581168, + "grad_norm": 1.131690502166748, + "learning_rate": 8.453608247422681e-06, + "loss": 2.0962, + "step": 41 + }, + { + "epoch": 0.08644198610753795, + "grad_norm": 1.1626195907592773, + "learning_rate": 8.65979381443299e-06, + "loss": 2.1161, + "step": 42 + }, + { + "epoch": 0.0885001286339079, + "grad_norm": 1.1508581638336182, + "learning_rate": 8.865979381443299e-06, + "loss": 1.9856, + "step": 43 + }, + { + "epoch": 0.09055827116027786, + "grad_norm": 1.2286733388900757, + "learning_rate": 9.072164948453609e-06, + "loss": 2.076, + "step": 44 + }, + { + "epoch": 0.0926164136866478, + "grad_norm": 1.82068932056427, + "learning_rate": 9.278350515463918e-06, + "loss": 1.9995, + "step": 45 + }, + { + "epoch": 0.09467455621301775, + "grad_norm": 2.079101324081421, + "learning_rate": 9.484536082474226e-06, + "loss": 1.9601, + "step": 46 + }, + { + "epoch": 0.0967326987393877, + "grad_norm": 1.1209226846694946, + "learning_rate": 9.690721649484536e-06, + "loss": 1.9346, + "step": 47 + }, + { + "epoch": 0.09879084126575766, + "grad_norm": 1.0579711198806763, + "learning_rate": 9.896907216494846e-06, + "loss": 1.8764, + "step": 48 + }, + { + "epoch": 0.1008489837921276, + "grad_norm": 1.0434011220932007, + "learning_rate": 1.0103092783505156e-05, + "loss": 1.8483, + "step": 49 + }, + { + "epoch": 0.10290712631849755, + "grad_norm": 1.0089991092681885, + "learning_rate": 1.0309278350515464e-05, + "loss": 1.8018, + "step": 50 + }, + { + "epoch": 0.10496526884486751, + "grad_norm": 1.0117324590682983, + "learning_rate": 1.0515463917525775e-05, + "loss": 1.8003, + "step": 51 + }, + { + "epoch": 0.10702341137123746, + "grad_norm": 1.0006697177886963, + "learning_rate": 1.0721649484536083e-05, + "loss": 1.7482, + "step": 52 + }, + { + "epoch": 0.1090815538976074, + "grad_norm": 2.1164329051971436, + "learning_rate": 1.0927835051546391e-05, + "loss": 1.7363, + "step": 53 + }, + { + "epoch": 0.11113969642397736, + "grad_norm": 0.9573502540588379, + "learning_rate": 1.1134020618556703e-05, + "loss": 1.661, + "step": 54 + }, + { + "epoch": 0.11319783895034731, + "grad_norm": 1.0059764385223389, + "learning_rate": 1.134020618556701e-05, + "loss": 1.6979, + "step": 55 + }, + { + "epoch": 0.11525598147671726, + "grad_norm": 0.9719656109809875, + "learning_rate": 1.1546391752577319e-05, + "loss": 1.6318, + "step": 56 + }, + { + "epoch": 0.11731412400308722, + "grad_norm": 1.0024539232254028, + "learning_rate": 1.175257731958763e-05, + "loss": 1.6283, + "step": 57 + }, + { + "epoch": 0.11937226652945716, + "grad_norm": 0.9772456288337708, + "learning_rate": 1.1958762886597938e-05, + "loss": 1.5611, + "step": 58 + }, + { + "epoch": 0.12143040905582711, + "grad_norm": 0.9947625994682312, + "learning_rate": 1.2164948453608248e-05, + "loss": 1.6073, + "step": 59 + }, + { + "epoch": 0.12348855158219707, + "grad_norm": 2.112889051437378, + "learning_rate": 1.2371134020618558e-05, + "loss": 1.6208, + "step": 60 + }, + { + "epoch": 0.12554669410856703, + "grad_norm": 1.0515345335006714, + "learning_rate": 1.2577319587628866e-05, + "loss": 1.569, + "step": 61 + }, + { + "epoch": 0.12760483663493696, + "grad_norm": 1.0782145261764526, + "learning_rate": 1.2783505154639176e-05, + "loss": 1.5097, + "step": 62 + }, + { + "epoch": 0.12966297916130692, + "grad_norm": 1.154104232788086, + "learning_rate": 1.2989690721649485e-05, + "loss": 1.5472, + "step": 63 + }, + { + "epoch": 0.13172112168767688, + "grad_norm": 1.1614656448364258, + "learning_rate": 1.3195876288659795e-05, + "loss": 1.4833, + "step": 64 + }, + { + "epoch": 0.13377926421404682, + "grad_norm": 1.1720911264419556, + "learning_rate": 1.3402061855670103e-05, + "loss": 1.4644, + "step": 65 + }, + { + "epoch": 0.13583740674041678, + "grad_norm": 1.8903896808624268, + "learning_rate": 1.3608247422680415e-05, + "loss": 1.4286, + "step": 66 + }, + { + "epoch": 0.13789554926678674, + "grad_norm": 1.2675013542175293, + "learning_rate": 1.3814432989690723e-05, + "loss": 1.416, + "step": 67 + }, + { + "epoch": 0.13995369179315667, + "grad_norm": 1.266434907913208, + "learning_rate": 1.402061855670103e-05, + "loss": 1.3171, + "step": 68 + }, + { + "epoch": 0.14201183431952663, + "grad_norm": 1.3408889770507812, + "learning_rate": 1.4226804123711342e-05, + "loss": 1.3396, + "step": 69 + }, + { + "epoch": 0.1440699768458966, + "grad_norm": 1.3862446546554565, + "learning_rate": 1.443298969072165e-05, + "loss": 1.2642, + "step": 70 + }, + { + "epoch": 0.14612811937226652, + "grad_norm": 2.110553026199341, + "learning_rate": 1.4639175257731958e-05, + "loss": 1.2593, + "step": 71 + }, + { + "epoch": 0.14818626189863648, + "grad_norm": 1.7017499208450317, + "learning_rate": 1.484536082474227e-05, + "loss": 1.24, + "step": 72 + }, + { + "epoch": 0.15024440442500644, + "grad_norm": 1.9851700067520142, + "learning_rate": 1.5051546391752578e-05, + "loss": 1.2313, + "step": 73 + }, + { + "epoch": 0.15230254695137638, + "grad_norm": 2.009608030319214, + "learning_rate": 1.5257731958762888e-05, + "loss": 1.1281, + "step": 74 + }, + { + "epoch": 0.15436068947774634, + "grad_norm": 2.7587485313415527, + "learning_rate": 1.5463917525773197e-05, + "loss": 1.1248, + "step": 75 + }, + { + "epoch": 0.1564188320041163, + "grad_norm": 2.780954599380493, + "learning_rate": 1.5670103092783507e-05, + "loss": 1.0797, + "step": 76 + }, + { + "epoch": 0.15847697453048623, + "grad_norm": 3.1470866203308105, + "learning_rate": 1.5876288659793813e-05, + "loss": 1.0064, + "step": 77 + }, + { + "epoch": 0.1605351170568562, + "grad_norm": 4.653595447540283, + "learning_rate": 1.6082474226804127e-05, + "loss": 0.9219, + "step": 78 + }, + { + "epoch": 0.16259325958322615, + "grad_norm": 4.157363414764404, + "learning_rate": 1.6288659793814433e-05, + "loss": 0.8709, + "step": 79 + }, + { + "epoch": 0.16465140210959608, + "grad_norm": 4.5814924240112305, + "learning_rate": 1.6494845360824743e-05, + "loss": 0.7693, + "step": 80 + }, + { + "epoch": 0.16670954463596604, + "grad_norm": 5.096139907836914, + "learning_rate": 1.6701030927835052e-05, + "loss": 0.6868, + "step": 81 + }, + { + "epoch": 0.168767687162336, + "grad_norm": 4.858880519866943, + "learning_rate": 1.6907216494845362e-05, + "loss": 0.5971, + "step": 82 + }, + { + "epoch": 0.17082582968870594, + "grad_norm": 4.42564582824707, + "learning_rate": 1.7113402061855672e-05, + "loss": 0.4719, + "step": 83 + }, + { + "epoch": 0.1728839722150759, + "grad_norm": 7.720851421356201, + "learning_rate": 1.731958762886598e-05, + "loss": 0.3943, + "step": 84 + }, + { + "epoch": 0.17494211474144586, + "grad_norm": 0.41923192143440247, + "learning_rate": 1.752577319587629e-05, + "loss": 0.3635, + "step": 85 + }, + { + "epoch": 0.1770002572678158, + "grad_norm": 0.2771846354007721, + "learning_rate": 1.7731958762886598e-05, + "loss": 0.3597, + "step": 86 + }, + { + "epoch": 0.17905839979418575, + "grad_norm": 0.24761857092380524, + "learning_rate": 1.793814432989691e-05, + "loss": 0.3735, + "step": 87 + }, + { + "epoch": 0.1811165423205557, + "grad_norm": 0.23277048766613007, + "learning_rate": 1.8144329896907217e-05, + "loss": 0.3643, + "step": 88 + }, + { + "epoch": 0.18317468484692565, + "grad_norm": 0.22931228578090668, + "learning_rate": 1.8350515463917527e-05, + "loss": 0.3519, + "step": 89 + }, + { + "epoch": 0.1852328273732956, + "grad_norm": 0.20750615000724792, + "learning_rate": 1.8556701030927837e-05, + "loss": 0.3431, + "step": 90 + }, + { + "epoch": 0.18729096989966554, + "grad_norm": 0.2080322951078415, + "learning_rate": 1.8762886597938147e-05, + "loss": 0.3632, + "step": 91 + }, + { + "epoch": 0.1893491124260355, + "grad_norm": 0.20186181366443634, + "learning_rate": 1.8969072164948453e-05, + "loss": 0.3492, + "step": 92 + }, + { + "epoch": 0.19140725495240546, + "grad_norm": 0.19172786176204681, + "learning_rate": 1.9175257731958766e-05, + "loss": 0.3552, + "step": 93 + }, + { + "epoch": 0.1934653974787754, + "grad_norm": 0.1747850626707077, + "learning_rate": 1.9381443298969072e-05, + "loss": 0.3355, + "step": 94 + }, + { + "epoch": 0.19552354000514535, + "grad_norm": 0.196411594748497, + "learning_rate": 1.9587628865979382e-05, + "loss": 0.3271, + "step": 95 + }, + { + "epoch": 0.1975816825315153, + "grad_norm": 0.20063228905200958, + "learning_rate": 1.9793814432989692e-05, + "loss": 0.3351, + "step": 96 + }, + { + "epoch": 0.19963982505788525, + "grad_norm": 0.19240939617156982, + "learning_rate": 2e-05, + "loss": 0.3266, + "step": 97 + }, + { + "epoch": 0.2016979675842552, + "grad_norm": 0.18206572532653809, + "learning_rate": 1.997709049255441e-05, + "loss": 0.3393, + "step": 98 + }, + { + "epoch": 0.20375611011062517, + "grad_norm": 0.20384562015533447, + "learning_rate": 1.9954180985108823e-05, + "loss": 0.3395, + "step": 99 + }, + { + "epoch": 0.2058142526369951, + "grad_norm": 0.19944581389427185, + "learning_rate": 1.9931271477663232e-05, + "loss": 0.3268, + "step": 100 + }, + { + "epoch": 0.2058142526369951, + "eval_loss": 0.3456890285015106, + "eval_runtime": 2114.0178, + "eval_samples_per_second": 3.677, + "eval_steps_per_second": 0.92, + "step": 100 + }, + { + "epoch": 0.20787239516336506, + "grad_norm": 0.17743557691574097, + "learning_rate": 1.990836197021764e-05, + "loss": 0.3439, + "step": 101 + }, + { + "epoch": 0.20993053768973502, + "grad_norm": 0.18746449053287506, + "learning_rate": 1.9885452462772053e-05, + "loss": 0.326, + "step": 102 + }, + { + "epoch": 0.21198868021610495, + "grad_norm": 0.18555815517902374, + "learning_rate": 1.9862542955326462e-05, + "loss": 0.3337, + "step": 103 + }, + { + "epoch": 0.2140468227424749, + "grad_norm": 0.16591575741767883, + "learning_rate": 1.9839633447880874e-05, + "loss": 0.3121, + "step": 104 + }, + { + "epoch": 0.21610496526884487, + "grad_norm": 0.1621987372636795, + "learning_rate": 1.9816723940435283e-05, + "loss": 0.3287, + "step": 105 + }, + { + "epoch": 0.2181631077952148, + "grad_norm": 0.1614532470703125, + "learning_rate": 1.9793814432989692e-05, + "loss": 0.3306, + "step": 106 + }, + { + "epoch": 0.22022125032158477, + "grad_norm": 0.17993387579917908, + "learning_rate": 1.9770904925544104e-05, + "loss": 0.3341, + "step": 107 + }, + { + "epoch": 0.22227939284795473, + "grad_norm": 0.1550011783838272, + "learning_rate": 1.9747995418098513e-05, + "loss": 0.3197, + "step": 108 + }, + { + "epoch": 0.22433753537432466, + "grad_norm": 0.18471524119377136, + "learning_rate": 1.9725085910652922e-05, + "loss": 0.3285, + "step": 109 + }, + { + "epoch": 0.22639567790069462, + "grad_norm": 0.15604373812675476, + "learning_rate": 1.9702176403207334e-05, + "loss": 0.3298, + "step": 110 + }, + { + "epoch": 0.22845382042706458, + "grad_norm": 0.1682298630475998, + "learning_rate": 1.9679266895761743e-05, + "loss": 0.3343, + "step": 111 + }, + { + "epoch": 0.2305119629534345, + "grad_norm": 0.14933635294437408, + "learning_rate": 1.9656357388316152e-05, + "loss": 0.3134, + "step": 112 + }, + { + "epoch": 0.23257010547980447, + "grad_norm": 0.14892347157001495, + "learning_rate": 1.963344788087056e-05, + "loss": 0.3154, + "step": 113 + }, + { + "epoch": 0.23462824800617443, + "grad_norm": 0.1577889323234558, + "learning_rate": 1.9610538373424973e-05, + "loss": 0.3122, + "step": 114 + }, + { + "epoch": 0.23668639053254437, + "grad_norm": 0.16482344269752502, + "learning_rate": 1.9587628865979382e-05, + "loss": 0.3193, + "step": 115 + }, + { + "epoch": 0.23874453305891433, + "grad_norm": 0.15328913927078247, + "learning_rate": 1.956471935853379e-05, + "loss": 0.3217, + "step": 116 + }, + { + "epoch": 0.2408026755852843, + "grad_norm": 0.16140656173229218, + "learning_rate": 1.9541809851088203e-05, + "loss": 0.318, + "step": 117 + }, + { + "epoch": 0.24286081811165422, + "grad_norm": 0.15448373556137085, + "learning_rate": 1.9518900343642612e-05, + "loss": 0.3205, + "step": 118 + }, + { + "epoch": 0.24491896063802418, + "grad_norm": 0.14716887474060059, + "learning_rate": 1.9495990836197025e-05, + "loss": 0.3164, + "step": 119 + }, + { + "epoch": 0.24697710316439414, + "grad_norm": 0.16582027077674866, + "learning_rate": 1.9473081328751433e-05, + "loss": 0.3191, + "step": 120 + }, + { + "epoch": 0.24903524569076407, + "grad_norm": 0.15213699638843536, + "learning_rate": 1.9450171821305842e-05, + "loss": 0.304, + "step": 121 + }, + { + "epoch": 0.25109338821713406, + "grad_norm": 0.1659238487482071, + "learning_rate": 1.9427262313860255e-05, + "loss": 0.3184, + "step": 122 + }, + { + "epoch": 0.253151530743504, + "grad_norm": 0.15596656501293182, + "learning_rate": 1.9404352806414663e-05, + "loss": 0.3092, + "step": 123 + }, + { + "epoch": 0.2552096732698739, + "grad_norm": 0.15868476033210754, + "learning_rate": 1.9381443298969072e-05, + "loss": 0.3163, + "step": 124 + }, + { + "epoch": 0.2572678157962439, + "grad_norm": 0.15386095643043518, + "learning_rate": 1.9358533791523485e-05, + "loss": 0.3049, + "step": 125 + }, + { + "epoch": 0.25932595832261385, + "grad_norm": 0.15179213881492615, + "learning_rate": 1.9335624284077894e-05, + "loss": 0.3131, + "step": 126 + }, + { + "epoch": 0.2613841008489838, + "grad_norm": 0.1595134735107422, + "learning_rate": 1.9312714776632306e-05, + "loss": 0.3069, + "step": 127 + }, + { + "epoch": 0.26344224337535377, + "grad_norm": 0.16989803314208984, + "learning_rate": 1.9289805269186715e-05, + "loss": 0.3052, + "step": 128 + }, + { + "epoch": 0.2655003859017237, + "grad_norm": 0.14803892374038696, + "learning_rate": 1.9266895761741124e-05, + "loss": 0.3065, + "step": 129 + }, + { + "epoch": 0.26755852842809363, + "grad_norm": 0.16676583886146545, + "learning_rate": 1.9243986254295536e-05, + "loss": 0.2962, + "step": 130 + }, + { + "epoch": 0.2696166709544636, + "grad_norm": 0.15694552659988403, + "learning_rate": 1.9221076746849945e-05, + "loss": 0.3096, + "step": 131 + }, + { + "epoch": 0.27167481348083355, + "grad_norm": 0.17696696519851685, + "learning_rate": 1.9198167239404354e-05, + "loss": 0.3145, + "step": 132 + }, + { + "epoch": 0.2737329560072035, + "grad_norm": 0.17204038798809052, + "learning_rate": 1.9175257731958766e-05, + "loss": 0.3248, + "step": 133 + }, + { + "epoch": 0.2757910985335735, + "grad_norm": 0.15630359947681427, + "learning_rate": 1.9152348224513175e-05, + "loss": 0.3117, + "step": 134 + }, + { + "epoch": 0.2778492410599434, + "grad_norm": 0.15757997334003448, + "learning_rate": 1.9129438717067584e-05, + "loss": 0.3145, + "step": 135 + }, + { + "epoch": 0.27990738358631334, + "grad_norm": 0.16273653507232666, + "learning_rate": 1.9106529209621996e-05, + "loss": 0.3159, + "step": 136 + }, + { + "epoch": 0.28196552611268333, + "grad_norm": 0.16213104128837585, + "learning_rate": 1.9083619702176405e-05, + "loss": 0.2949, + "step": 137 + }, + { + "epoch": 0.28402366863905326, + "grad_norm": 0.15377865731716156, + "learning_rate": 1.9060710194730814e-05, + "loss": 0.306, + "step": 138 + }, + { + "epoch": 0.2860818111654232, + "grad_norm": 0.1545962244272232, + "learning_rate": 1.9037800687285223e-05, + "loss": 0.2966, + "step": 139 + }, + { + "epoch": 0.2881399536917932, + "grad_norm": 0.15516617894172668, + "learning_rate": 1.9014891179839635e-05, + "loss": 0.3122, + "step": 140 + }, + { + "epoch": 0.2901980962181631, + "grad_norm": 0.14734458923339844, + "learning_rate": 1.8991981672394044e-05, + "loss": 0.3118, + "step": 141 + }, + { + "epoch": 0.29225623874453305, + "grad_norm": 0.1644304096698761, + "learning_rate": 1.8969072164948453e-05, + "loss": 0.3027, + "step": 142 + }, + { + "epoch": 0.29431438127090304, + "grad_norm": 0.14632569253444672, + "learning_rate": 1.8946162657502865e-05, + "loss": 0.3023, + "step": 143 + }, + { + "epoch": 0.29637252379727297, + "grad_norm": 0.1573137789964676, + "learning_rate": 1.8923253150057274e-05, + "loss": 0.3102, + "step": 144 + }, + { + "epoch": 0.2984306663236429, + "grad_norm": 0.16423144936561584, + "learning_rate": 1.8900343642611686e-05, + "loss": 0.3033, + "step": 145 + }, + { + "epoch": 0.3004888088500129, + "grad_norm": 0.15420907735824585, + "learning_rate": 1.8877434135166095e-05, + "loss": 0.3089, + "step": 146 + }, + { + "epoch": 0.3025469513763828, + "grad_norm": 0.1579178273677826, + "learning_rate": 1.8854524627720504e-05, + "loss": 0.3071, + "step": 147 + }, + { + "epoch": 0.30460509390275275, + "grad_norm": 0.15866397321224213, + "learning_rate": 1.8831615120274916e-05, + "loss": 0.3083, + "step": 148 + }, + { + "epoch": 0.30666323642912274, + "grad_norm": 0.16651487350463867, + "learning_rate": 1.8808705612829325e-05, + "loss": 0.3099, + "step": 149 + }, + { + "epoch": 0.3087213789554927, + "grad_norm": 0.16281908750534058, + "learning_rate": 1.8785796105383734e-05, + "loss": 0.3034, + "step": 150 + }, + { + "epoch": 0.3107795214818626, + "grad_norm": 0.17449837923049927, + "learning_rate": 1.8762886597938147e-05, + "loss": 0.3054, + "step": 151 + }, + { + "epoch": 0.3128376640082326, + "grad_norm": 0.15403546392917633, + "learning_rate": 1.8739977090492555e-05, + "loss": 0.297, + "step": 152 + }, + { + "epoch": 0.31489580653460253, + "grad_norm": 0.1472466140985489, + "learning_rate": 1.8717067583046968e-05, + "loss": 0.2973, + "step": 153 + }, + { + "epoch": 0.31695394906097246, + "grad_norm": 0.16027937829494476, + "learning_rate": 1.8694158075601377e-05, + "loss": 0.3054, + "step": 154 + }, + { + "epoch": 0.31901209158734245, + "grad_norm": 0.17086225748062134, + "learning_rate": 1.8671248568155786e-05, + "loss": 0.307, + "step": 155 + }, + { + "epoch": 0.3210702341137124, + "grad_norm": 0.15930697321891785, + "learning_rate": 1.8648339060710198e-05, + "loss": 0.293, + "step": 156 + }, + { + "epoch": 0.3231283766400823, + "grad_norm": 0.17086376249790192, + "learning_rate": 1.8625429553264607e-05, + "loss": 0.293, + "step": 157 + }, + { + "epoch": 0.3251865191664523, + "grad_norm": 0.15970875322818756, + "learning_rate": 1.8602520045819016e-05, + "loss": 0.3083, + "step": 158 + }, + { + "epoch": 0.32724466169282224, + "grad_norm": 0.16355909407138824, + "learning_rate": 1.8579610538373428e-05, + "loss": 0.3139, + "step": 159 + }, + { + "epoch": 0.32930280421919217, + "grad_norm": 0.15183711051940918, + "learning_rate": 1.8556701030927837e-05, + "loss": 0.2953, + "step": 160 + }, + { + "epoch": 0.33136094674556216, + "grad_norm": 0.15123715996742249, + "learning_rate": 1.853379152348225e-05, + "loss": 0.3025, + "step": 161 + }, + { + "epoch": 0.3334190892719321, + "grad_norm": 0.1576143503189087, + "learning_rate": 1.8510882016036658e-05, + "loss": 0.2904, + "step": 162 + }, + { + "epoch": 0.335477231798302, + "grad_norm": 0.1457504779100418, + "learning_rate": 1.8487972508591067e-05, + "loss": 0.2909, + "step": 163 + }, + { + "epoch": 0.337535374324672, + "grad_norm": 0.1557442992925644, + "learning_rate": 1.846506300114548e-05, + "loss": 0.3027, + "step": 164 + }, + { + "epoch": 0.33959351685104194, + "grad_norm": 0.15662318468093872, + "learning_rate": 1.8442153493699888e-05, + "loss": 0.311, + "step": 165 + }, + { + "epoch": 0.3416516593774119, + "grad_norm": 0.16177058219909668, + "learning_rate": 1.8419243986254297e-05, + "loss": 0.2944, + "step": 166 + }, + { + "epoch": 0.34370980190378186, + "grad_norm": 0.16406729817390442, + "learning_rate": 1.8396334478808706e-05, + "loss": 0.2927, + "step": 167 + }, + { + "epoch": 0.3457679444301518, + "grad_norm": 0.16642791032791138, + "learning_rate": 1.8373424971363115e-05, + "loss": 0.3063, + "step": 168 + }, + { + "epoch": 0.34782608695652173, + "grad_norm": 0.1650693714618683, + "learning_rate": 1.8350515463917527e-05, + "loss": 0.2957, + "step": 169 + }, + { + "epoch": 0.3498842294828917, + "grad_norm": 0.15349675714969635, + "learning_rate": 1.8327605956471936e-05, + "loss": 0.297, + "step": 170 + }, + { + "epoch": 0.35194237200926165, + "grad_norm": 0.17770209908485413, + "learning_rate": 1.8304696449026348e-05, + "loss": 0.3011, + "step": 171 + }, + { + "epoch": 0.3540005145356316, + "grad_norm": 0.1647631675004959, + "learning_rate": 1.8281786941580757e-05, + "loss": 0.2962, + "step": 172 + }, + { + "epoch": 0.35605865706200157, + "grad_norm": 0.1603834480047226, + "learning_rate": 1.8258877434135166e-05, + "loss": 0.2937, + "step": 173 + }, + { + "epoch": 0.3581167995883715, + "grad_norm": 0.16780880093574524, + "learning_rate": 1.8235967926689578e-05, + "loss": 0.2997, + "step": 174 + }, + { + "epoch": 0.36017494211474144, + "grad_norm": 0.15976767241954803, + "learning_rate": 1.8213058419243987e-05, + "loss": 0.3043, + "step": 175 + }, + { + "epoch": 0.3622330846411114, + "grad_norm": 0.16236485540866852, + "learning_rate": 1.8190148911798396e-05, + "loss": 0.3069, + "step": 176 + }, + { + "epoch": 0.36429122716748136, + "grad_norm": 0.16391968727111816, + "learning_rate": 1.816723940435281e-05, + "loss": 0.2923, + "step": 177 + }, + { + "epoch": 0.3663493696938513, + "grad_norm": 0.15806889533996582, + "learning_rate": 1.8144329896907217e-05, + "loss": 0.2872, + "step": 178 + }, + { + "epoch": 0.3684075122202212, + "grad_norm": 0.1627352088689804, + "learning_rate": 1.812142038946163e-05, + "loss": 0.3032, + "step": 179 + }, + { + "epoch": 0.3704656547465912, + "grad_norm": 0.15103371441364288, + "learning_rate": 1.809851088201604e-05, + "loss": 0.2847, + "step": 180 + }, + { + "epoch": 0.37252379727296114, + "grad_norm": 0.15178488194942474, + "learning_rate": 1.8075601374570447e-05, + "loss": 0.3017, + "step": 181 + }, + { + "epoch": 0.3745819397993311, + "grad_norm": 0.15493899583816528, + "learning_rate": 1.805269186712486e-05, + "loss": 0.2901, + "step": 182 + }, + { + "epoch": 0.37664008232570106, + "grad_norm": 0.15990686416625977, + "learning_rate": 1.802978235967927e-05, + "loss": 0.2861, + "step": 183 + }, + { + "epoch": 0.378698224852071, + "grad_norm": 0.15824148058891296, + "learning_rate": 1.8006872852233677e-05, + "loss": 0.2885, + "step": 184 + }, + { + "epoch": 0.38075636737844093, + "grad_norm": 0.15690775215625763, + "learning_rate": 1.798396334478809e-05, + "loss": 0.2814, + "step": 185 + }, + { + "epoch": 0.3828145099048109, + "grad_norm": 0.15833796560764313, + "learning_rate": 1.79610538373425e-05, + "loss": 0.2847, + "step": 186 + }, + { + "epoch": 0.38487265243118085, + "grad_norm": 0.16560044884681702, + "learning_rate": 1.793814432989691e-05, + "loss": 0.3061, + "step": 187 + }, + { + "epoch": 0.3869307949575508, + "grad_norm": 0.16240179538726807, + "learning_rate": 1.791523482245132e-05, + "loss": 0.2943, + "step": 188 + }, + { + "epoch": 0.38898893748392077, + "grad_norm": 0.15825721621513367, + "learning_rate": 1.789232531500573e-05, + "loss": 0.2934, + "step": 189 + }, + { + "epoch": 0.3910470800102907, + "grad_norm": 0.16665388643741608, + "learning_rate": 1.786941580756014e-05, + "loss": 0.291, + "step": 190 + }, + { + "epoch": 0.39310522253666064, + "grad_norm": 0.16581200063228607, + "learning_rate": 1.784650630011455e-05, + "loss": 0.2849, + "step": 191 + }, + { + "epoch": 0.3951633650630306, + "grad_norm": 0.1604345291852951, + "learning_rate": 1.782359679266896e-05, + "loss": 0.3, + "step": 192 + }, + { + "epoch": 0.39722150758940056, + "grad_norm": 0.16107915341854095, + "learning_rate": 1.7800687285223368e-05, + "loss": 0.2847, + "step": 193 + }, + { + "epoch": 0.3992796501157705, + "grad_norm": 0.1571730375289917, + "learning_rate": 1.7777777777777777e-05, + "loss": 0.2863, + "step": 194 + }, + { + "epoch": 0.4013377926421405, + "grad_norm": 0.1656399518251419, + "learning_rate": 1.775486827033219e-05, + "loss": 0.2878, + "step": 195 + }, + { + "epoch": 0.4033959351685104, + "grad_norm": 0.16738460958003998, + "learning_rate": 1.7731958762886598e-05, + "loss": 0.286, + "step": 196 + }, + { + "epoch": 0.40545407769488034, + "grad_norm": 0.16704292595386505, + "learning_rate": 1.770904925544101e-05, + "loss": 0.2919, + "step": 197 + }, + { + "epoch": 0.40751222022125033, + "grad_norm": 0.16215579211711884, + "learning_rate": 1.768613974799542e-05, + "loss": 0.2874, + "step": 198 + }, + { + "epoch": 0.40957036274762026, + "grad_norm": 0.15573479235172272, + "learning_rate": 1.7663230240549828e-05, + "loss": 0.2904, + "step": 199 + }, + { + "epoch": 0.4116285052739902, + "grad_norm": 0.1707623153924942, + "learning_rate": 1.764032073310424e-05, + "loss": 0.289, + "step": 200 + }, + { + "epoch": 0.4116285052739902, + "eval_loss": 0.3214050829410553, + "eval_runtime": 2449.7742, + "eval_samples_per_second": 3.173, + "eval_steps_per_second": 0.794, + "step": 200 + }, + { + "epoch": 0.4136866478003602, + "grad_norm": 0.1699172556400299, + "learning_rate": 1.761741122565865e-05, + "loss": 0.2852, + "step": 201 + }, + { + "epoch": 0.4157447903267301, + "grad_norm": 0.19150058925151825, + "learning_rate": 1.7594501718213058e-05, + "loss": 0.29, + "step": 202 + }, + { + "epoch": 0.41780293285310005, + "grad_norm": 0.15794627368450165, + "learning_rate": 1.757159221076747e-05, + "loss": 0.2746, + "step": 203 + }, + { + "epoch": 0.41986107537947004, + "grad_norm": 0.17305190861225128, + "learning_rate": 1.754868270332188e-05, + "loss": 0.3003, + "step": 204 + }, + { + "epoch": 0.42191921790583997, + "grad_norm": 0.16257523000240326, + "learning_rate": 1.752577319587629e-05, + "loss": 0.2789, + "step": 205 + }, + { + "epoch": 0.4239773604322099, + "grad_norm": 0.17273619771003723, + "learning_rate": 1.75028636884307e-05, + "loss": 0.2917, + "step": 206 + }, + { + "epoch": 0.4260355029585799, + "grad_norm": 0.17502790689468384, + "learning_rate": 1.747995418098511e-05, + "loss": 0.2992, + "step": 207 + }, + { + "epoch": 0.4280936454849498, + "grad_norm": 0.16464050114154816, + "learning_rate": 1.745704467353952e-05, + "loss": 0.2873, + "step": 208 + }, + { + "epoch": 0.43015178801131976, + "grad_norm": 0.1681668758392334, + "learning_rate": 1.743413516609393e-05, + "loss": 0.2991, + "step": 209 + }, + { + "epoch": 0.43220993053768975, + "grad_norm": 0.16957956552505493, + "learning_rate": 1.741122565864834e-05, + "loss": 0.2868, + "step": 210 + }, + { + "epoch": 0.4342680730640597, + "grad_norm": 0.15875883400440216, + "learning_rate": 1.738831615120275e-05, + "loss": 0.2946, + "step": 211 + }, + { + "epoch": 0.4363262155904296, + "grad_norm": 0.18127889931201935, + "learning_rate": 1.736540664375716e-05, + "loss": 0.2835, + "step": 212 + }, + { + "epoch": 0.4383843581167996, + "grad_norm": 0.17822811007499695, + "learning_rate": 1.7342497136311573e-05, + "loss": 0.2944, + "step": 213 + }, + { + "epoch": 0.44044250064316953, + "grad_norm": 0.17555806040763855, + "learning_rate": 1.731958762886598e-05, + "loss": 0.3001, + "step": 214 + }, + { + "epoch": 0.44250064316953946, + "grad_norm": 0.18709121644496918, + "learning_rate": 1.729667812142039e-05, + "loss": 0.282, + "step": 215 + }, + { + "epoch": 0.44455878569590945, + "grad_norm": 0.16322475671768188, + "learning_rate": 1.7273768613974803e-05, + "loss": 0.2883, + "step": 216 + }, + { + "epoch": 0.4466169282222794, + "grad_norm": 0.1677054911851883, + "learning_rate": 1.7250859106529212e-05, + "loss": 0.28, + "step": 217 + }, + { + "epoch": 0.4486750707486493, + "grad_norm": 0.15764063596725464, + "learning_rate": 1.722794959908362e-05, + "loss": 0.2768, + "step": 218 + }, + { + "epoch": 0.4507332132750193, + "grad_norm": 0.16166841983795166, + "learning_rate": 1.7205040091638033e-05, + "loss": 0.2868, + "step": 219 + }, + { + "epoch": 0.45279135580138924, + "grad_norm": 0.1799350380897522, + "learning_rate": 1.7182130584192442e-05, + "loss": 0.2891, + "step": 220 + }, + { + "epoch": 0.45484949832775917, + "grad_norm": 0.18119174242019653, + "learning_rate": 1.715922107674685e-05, + "loss": 0.2841, + "step": 221 + }, + { + "epoch": 0.45690764085412916, + "grad_norm": 0.17725548148155212, + "learning_rate": 1.713631156930126e-05, + "loss": 0.3038, + "step": 222 + }, + { + "epoch": 0.4589657833804991, + "grad_norm": 0.1628233790397644, + "learning_rate": 1.7113402061855672e-05, + "loss": 0.2868, + "step": 223 + }, + { + "epoch": 0.461023925906869, + "grad_norm": 0.1745166927576065, + "learning_rate": 1.709049255441008e-05, + "loss": 0.3033, + "step": 224 + }, + { + "epoch": 0.463082068433239, + "grad_norm": 0.17708267271518707, + "learning_rate": 1.706758304696449e-05, + "loss": 0.2842, + "step": 225 + }, + { + "epoch": 0.46514021095960895, + "grad_norm": 0.1738453358411789, + "learning_rate": 1.7044673539518902e-05, + "loss": 0.3005, + "step": 226 + }, + { + "epoch": 0.4671983534859789, + "grad_norm": 0.1706874966621399, + "learning_rate": 1.702176403207331e-05, + "loss": 0.2924, + "step": 227 + }, + { + "epoch": 0.46925649601234887, + "grad_norm": 0.1697423756122589, + "learning_rate": 1.699885452462772e-05, + "loss": 0.2783, + "step": 228 + }, + { + "epoch": 0.4713146385387188, + "grad_norm": 0.1783403754234314, + "learning_rate": 1.6975945017182132e-05, + "loss": 0.2924, + "step": 229 + }, + { + "epoch": 0.47337278106508873, + "grad_norm": 0.17431536316871643, + "learning_rate": 1.695303550973654e-05, + "loss": 0.2792, + "step": 230 + }, + { + "epoch": 0.4754309235914587, + "grad_norm": 0.164026141166687, + "learning_rate": 1.6930126002290953e-05, + "loss": 0.2825, + "step": 231 + }, + { + "epoch": 0.47748906611782865, + "grad_norm": 0.16449657082557678, + "learning_rate": 1.6907216494845362e-05, + "loss": 0.2831, + "step": 232 + }, + { + "epoch": 0.4795472086441986, + "grad_norm": 0.1812741607427597, + "learning_rate": 1.688430698739977e-05, + "loss": 0.2849, + "step": 233 + }, + { + "epoch": 0.4816053511705686, + "grad_norm": 0.18431834876537323, + "learning_rate": 1.6861397479954183e-05, + "loss": 0.2802, + "step": 234 + }, + { + "epoch": 0.4836634936969385, + "grad_norm": 0.18349015712738037, + "learning_rate": 1.6838487972508592e-05, + "loss": 0.2804, + "step": 235 + }, + { + "epoch": 0.48572163622330844, + "grad_norm": 0.1769968420267105, + "learning_rate": 1.6815578465063e-05, + "loss": 0.2777, + "step": 236 + }, + { + "epoch": 0.4877797787496784, + "grad_norm": 0.17207500338554382, + "learning_rate": 1.6792668957617413e-05, + "loss": 0.2883, + "step": 237 + }, + { + "epoch": 0.48983792127604836, + "grad_norm": 0.1729692667722702, + "learning_rate": 1.6769759450171822e-05, + "loss": 0.2784, + "step": 238 + }, + { + "epoch": 0.4918960638024183, + "grad_norm": 0.17234881222248077, + "learning_rate": 1.6746849942726235e-05, + "loss": 0.2816, + "step": 239 + }, + { + "epoch": 0.4939542063287883, + "grad_norm": 0.17132551968097687, + "learning_rate": 1.6723940435280644e-05, + "loss": 0.2812, + "step": 240 + }, + { + "epoch": 0.4960123488551582, + "grad_norm": 0.1752254068851471, + "learning_rate": 1.6701030927835052e-05, + "loss": 0.2799, + "step": 241 + }, + { + "epoch": 0.49807049138152815, + "grad_norm": 0.1768665313720703, + "learning_rate": 1.6678121420389465e-05, + "loss": 0.2966, + "step": 242 + }, + { + "epoch": 0.5001286339078981, + "grad_norm": 0.18139514327049255, + "learning_rate": 1.6655211912943874e-05, + "loss": 0.2816, + "step": 243 + }, + { + "epoch": 0.5021867764342681, + "grad_norm": 0.17312943935394287, + "learning_rate": 1.6632302405498283e-05, + "loss": 0.2845, + "step": 244 + }, + { + "epoch": 0.5042449189606381, + "grad_norm": 0.17966389656066895, + "learning_rate": 1.6609392898052695e-05, + "loss": 0.2864, + "step": 245 + }, + { + "epoch": 0.506303061487008, + "grad_norm": 0.16653811931610107, + "learning_rate": 1.6586483390607104e-05, + "loss": 0.2759, + "step": 246 + }, + { + "epoch": 0.5083612040133779, + "grad_norm": 0.1634613424539566, + "learning_rate": 1.6563573883161516e-05, + "loss": 0.2728, + "step": 247 + }, + { + "epoch": 0.5104193465397479, + "grad_norm": 0.17358507215976715, + "learning_rate": 1.654066437571592e-05, + "loss": 0.2706, + "step": 248 + }, + { + "epoch": 0.5124774890661178, + "grad_norm": 0.17524316906929016, + "learning_rate": 1.6517754868270334e-05, + "loss": 0.2805, + "step": 249 + }, + { + "epoch": 0.5145356315924878, + "grad_norm": 0.18134094774723053, + "learning_rate": 1.6494845360824743e-05, + "loss": 0.2909, + "step": 250 + }, + { + "epoch": 0.5165937741188578, + "grad_norm": 0.17795510590076447, + "learning_rate": 1.647193585337915e-05, + "loss": 0.2889, + "step": 251 + }, + { + "epoch": 0.5186519166452277, + "grad_norm": 0.16782547533512115, + "learning_rate": 1.6449026345933564e-05, + "loss": 0.2842, + "step": 252 + }, + { + "epoch": 0.5207100591715976, + "grad_norm": 0.17360062897205353, + "learning_rate": 1.6426116838487973e-05, + "loss": 0.2763, + "step": 253 + }, + { + "epoch": 0.5227682016979676, + "grad_norm": 0.17241406440734863, + "learning_rate": 1.6403207331042385e-05, + "loss": 0.2753, + "step": 254 + }, + { + "epoch": 0.5248263442243375, + "grad_norm": 0.1709229201078415, + "learning_rate": 1.6380297823596794e-05, + "loss": 0.2732, + "step": 255 + }, + { + "epoch": 0.5268844867507075, + "grad_norm": 0.1807374209165573, + "learning_rate": 1.6357388316151203e-05, + "loss": 0.2856, + "step": 256 + }, + { + "epoch": 0.5289426292770775, + "grad_norm": 0.1749904304742813, + "learning_rate": 1.6334478808705615e-05, + "loss": 0.285, + "step": 257 + }, + { + "epoch": 0.5310007718034474, + "grad_norm": 0.16673170030117035, + "learning_rate": 1.6311569301260024e-05, + "loss": 0.2825, + "step": 258 + }, + { + "epoch": 0.5330589143298173, + "grad_norm": 0.17239685356616974, + "learning_rate": 1.6288659793814433e-05, + "loss": 0.2845, + "step": 259 + }, + { + "epoch": 0.5351170568561873, + "grad_norm": 0.1831504851579666, + "learning_rate": 1.6265750286368845e-05, + "loss": 0.2859, + "step": 260 + }, + { + "epoch": 0.5371751993825572, + "grad_norm": 0.18507827818393707, + "learning_rate": 1.6242840778923254e-05, + "loss": 0.293, + "step": 261 + }, + { + "epoch": 0.5392333419089272, + "grad_norm": 0.16738134622573853, + "learning_rate": 1.6219931271477663e-05, + "loss": 0.2853, + "step": 262 + }, + { + "epoch": 0.5412914844352972, + "grad_norm": 0.1701226830482483, + "learning_rate": 1.6197021764032075e-05, + "loss": 0.2763, + "step": 263 + }, + { + "epoch": 0.5433496269616671, + "grad_norm": 0.18195705115795135, + "learning_rate": 1.6174112256586484e-05, + "loss": 0.2797, + "step": 264 + }, + { + "epoch": 0.545407769488037, + "grad_norm": 0.1832309514284134, + "learning_rate": 1.6151202749140896e-05, + "loss": 0.2885, + "step": 265 + }, + { + "epoch": 0.547465912014407, + "grad_norm": 0.1773810088634491, + "learning_rate": 1.6128293241695305e-05, + "loss": 0.2682, + "step": 266 + }, + { + "epoch": 0.5495240545407769, + "grad_norm": 0.16989603638648987, + "learning_rate": 1.6105383734249714e-05, + "loss": 0.2821, + "step": 267 + }, + { + "epoch": 0.551582197067147, + "grad_norm": 0.17835170030593872, + "learning_rate": 1.6082474226804127e-05, + "loss": 0.2774, + "step": 268 + }, + { + "epoch": 0.5536403395935169, + "grad_norm": 0.1777082234621048, + "learning_rate": 1.6059564719358535e-05, + "loss": 0.2726, + "step": 269 + }, + { + "epoch": 0.5556984821198868, + "grad_norm": 0.18766450881958008, + "learning_rate": 1.6036655211912944e-05, + "loss": 0.2879, + "step": 270 + }, + { + "epoch": 0.5577566246462567, + "grad_norm": 0.1868186593055725, + "learning_rate": 1.6013745704467357e-05, + "loss": 0.2808, + "step": 271 + }, + { + "epoch": 0.5598147671726267, + "grad_norm": 0.16695882380008698, + "learning_rate": 1.5990836197021766e-05, + "loss": 0.2668, + "step": 272 + }, + { + "epoch": 0.5618729096989966, + "grad_norm": 0.17224495112895966, + "learning_rate": 1.5967926689576178e-05, + "loss": 0.2682, + "step": 273 + }, + { + "epoch": 0.5639310522253667, + "grad_norm": 0.20116423070430756, + "learning_rate": 1.5945017182130587e-05, + "loss": 0.276, + "step": 274 + }, + { + "epoch": 0.5659891947517366, + "grad_norm": 0.19478343427181244, + "learning_rate": 1.5922107674684996e-05, + "loss": 0.2854, + "step": 275 + }, + { + "epoch": 0.5680473372781065, + "grad_norm": 0.20242950320243835, + "learning_rate": 1.5899198167239405e-05, + "loss": 0.2854, + "step": 276 + }, + { + "epoch": 0.5701054798044765, + "grad_norm": 0.19146093726158142, + "learning_rate": 1.5876288659793813e-05, + "loss": 0.2817, + "step": 277 + }, + { + "epoch": 0.5721636223308464, + "grad_norm": 0.1804896742105484, + "learning_rate": 1.5853379152348226e-05, + "loss": 0.2714, + "step": 278 + }, + { + "epoch": 0.5742217648572163, + "grad_norm": 0.19315646588802338, + "learning_rate": 1.5830469644902635e-05, + "loss": 0.2703, + "step": 279 + }, + { + "epoch": 0.5762799073835864, + "grad_norm": 0.1910266876220703, + "learning_rate": 1.5807560137457047e-05, + "loss": 0.2728, + "step": 280 + }, + { + "epoch": 0.5783380499099563, + "grad_norm": 0.20330773293972015, + "learning_rate": 1.5784650630011456e-05, + "loss": 0.2717, + "step": 281 + }, + { + "epoch": 0.5803961924363262, + "grad_norm": 0.19080683588981628, + "learning_rate": 1.5761741122565865e-05, + "loss": 0.2679, + "step": 282 + }, + { + "epoch": 0.5824543349626962, + "grad_norm": 0.18052135407924652, + "learning_rate": 1.5738831615120277e-05, + "loss": 0.2815, + "step": 283 + }, + { + "epoch": 0.5845124774890661, + "grad_norm": 0.1998361051082611, + "learning_rate": 1.5715922107674686e-05, + "loss": 0.2888, + "step": 284 + }, + { + "epoch": 0.586570620015436, + "grad_norm": 0.1978764683008194, + "learning_rate": 1.5693012600229095e-05, + "loss": 0.2926, + "step": 285 + }, + { + "epoch": 0.5886287625418061, + "grad_norm": 0.17189203202724457, + "learning_rate": 1.5670103092783507e-05, + "loss": 0.2674, + "step": 286 + }, + { + "epoch": 0.590686905068176, + "grad_norm": 0.1937166303396225, + "learning_rate": 1.5647193585337916e-05, + "loss": 0.2838, + "step": 287 + }, + { + "epoch": 0.5927450475945459, + "grad_norm": 0.18978627026081085, + "learning_rate": 1.5624284077892328e-05, + "loss": 0.273, + "step": 288 + }, + { + "epoch": 0.5948031901209159, + "grad_norm": 0.17718705534934998, + "learning_rate": 1.5601374570446737e-05, + "loss": 0.2842, + "step": 289 + }, + { + "epoch": 0.5968613326472858, + "grad_norm": 0.1912536770105362, + "learning_rate": 1.5578465063001146e-05, + "loss": 0.2736, + "step": 290 + }, + { + "epoch": 0.5989194751736557, + "grad_norm": 0.18104907870292664, + "learning_rate": 1.555555555555556e-05, + "loss": 0.274, + "step": 291 + }, + { + "epoch": 0.6009776177000258, + "grad_norm": 0.1620381772518158, + "learning_rate": 1.5532646048109967e-05, + "loss": 0.2663, + "step": 292 + }, + { + "epoch": 0.6030357602263957, + "grad_norm": 0.17973916232585907, + "learning_rate": 1.5509736540664376e-05, + "loss": 0.2791, + "step": 293 + }, + { + "epoch": 0.6050939027527656, + "grad_norm": 0.16821186244487762, + "learning_rate": 1.548682703321879e-05, + "loss": 0.2787, + "step": 294 + }, + { + "epoch": 0.6071520452791356, + "grad_norm": 0.18426693975925446, + "learning_rate": 1.5463917525773197e-05, + "loss": 0.2886, + "step": 295 + }, + { + "epoch": 0.6092101878055055, + "grad_norm": 0.19796033203601837, + "learning_rate": 1.5441008018327606e-05, + "loss": 0.268, + "step": 296 + }, + { + "epoch": 0.6112683303318754, + "grad_norm": 0.1971343755722046, + "learning_rate": 1.541809851088202e-05, + "loss": 0.2761, + "step": 297 + }, + { + "epoch": 0.6133264728582455, + "grad_norm": 0.17458567023277283, + "learning_rate": 1.5395189003436427e-05, + "loss": 0.2831, + "step": 298 + }, + { + "epoch": 0.6153846153846154, + "grad_norm": 0.17610400915145874, + "learning_rate": 1.537227949599084e-05, + "loss": 0.2691, + "step": 299 + }, + { + "epoch": 0.6174427579109854, + "grad_norm": 0.1929042488336563, + "learning_rate": 1.534936998854525e-05, + "loss": 0.2847, + "step": 300 + }, + { + "epoch": 0.6174427579109854, + "eval_loss": 0.2959522604942322, + "eval_runtime": 2428.6339, + "eval_samples_per_second": 3.201, + "eval_steps_per_second": 0.8, + "step": 300 + }, + { + "epoch": 0.6195009004373553, + "grad_norm": 0.19430233538150787, + "learning_rate": 1.5326460481099657e-05, + "loss": 0.279, + "step": 301 + }, + { + "epoch": 0.6215590429637252, + "grad_norm": 0.18542642891407013, + "learning_rate": 1.5303550973654066e-05, + "loss": 0.2695, + "step": 302 + }, + { + "epoch": 0.6236171854900951, + "grad_norm": 0.1850169450044632, + "learning_rate": 1.5280641466208475e-05, + "loss": 0.2847, + "step": 303 + }, + { + "epoch": 0.6256753280164652, + "grad_norm": 0.18449267745018005, + "learning_rate": 1.5257731958762888e-05, + "loss": 0.2804, + "step": 304 + }, + { + "epoch": 0.6277334705428351, + "grad_norm": 0.18608458340168, + "learning_rate": 1.5234822451317296e-05, + "loss": 0.2792, + "step": 305 + }, + { + "epoch": 0.6297916130692051, + "grad_norm": 0.21136076748371124, + "learning_rate": 1.5211912943871707e-05, + "loss": 0.2829, + "step": 306 + }, + { + "epoch": 0.631849755595575, + "grad_norm": 0.19672206044197083, + "learning_rate": 1.5189003436426118e-05, + "loss": 0.2854, + "step": 307 + }, + { + "epoch": 0.6339078981219449, + "grad_norm": 0.1834034025669098, + "learning_rate": 1.5166093928980528e-05, + "loss": 0.2775, + "step": 308 + }, + { + "epoch": 0.6359660406483149, + "grad_norm": 0.18414819240570068, + "learning_rate": 1.5143184421534937e-05, + "loss": 0.2794, + "step": 309 + }, + { + "epoch": 0.6380241831746849, + "grad_norm": 0.1890152245759964, + "learning_rate": 1.5120274914089348e-05, + "loss": 0.2718, + "step": 310 + }, + { + "epoch": 0.6400823257010548, + "grad_norm": 0.18923887610435486, + "learning_rate": 1.5097365406643758e-05, + "loss": 0.2795, + "step": 311 + }, + { + "epoch": 0.6421404682274248, + "grad_norm": 0.20047079026699066, + "learning_rate": 1.5074455899198169e-05, + "loss": 0.2811, + "step": 312 + }, + { + "epoch": 0.6441986107537947, + "grad_norm": 0.1910201609134674, + "learning_rate": 1.5051546391752578e-05, + "loss": 0.2732, + "step": 313 + }, + { + "epoch": 0.6462567532801646, + "grad_norm": 0.2021956443786621, + "learning_rate": 1.5028636884306988e-05, + "loss": 0.2806, + "step": 314 + }, + { + "epoch": 0.6483148958065346, + "grad_norm": 0.18957914412021637, + "learning_rate": 1.5005727376861399e-05, + "loss": 0.2681, + "step": 315 + }, + { + "epoch": 0.6503730383329046, + "grad_norm": 0.19858811795711517, + "learning_rate": 1.498281786941581e-05, + "loss": 0.2805, + "step": 316 + }, + { + "epoch": 0.6524311808592745, + "grad_norm": 0.1731935292482376, + "learning_rate": 1.4959908361970218e-05, + "loss": 0.2646, + "step": 317 + }, + { + "epoch": 0.6544893233856445, + "grad_norm": 0.19619058072566986, + "learning_rate": 1.4936998854524629e-05, + "loss": 0.2965, + "step": 318 + }, + { + "epoch": 0.6565474659120144, + "grad_norm": 0.18745696544647217, + "learning_rate": 1.491408934707904e-05, + "loss": 0.2766, + "step": 319 + }, + { + "epoch": 0.6586056084383843, + "grad_norm": 0.18006449937820435, + "learning_rate": 1.489117983963345e-05, + "loss": 0.2788, + "step": 320 + }, + { + "epoch": 0.6606637509647543, + "grad_norm": 0.17593689262866974, + "learning_rate": 1.486827033218786e-05, + "loss": 0.2813, + "step": 321 + }, + { + "epoch": 0.6627218934911243, + "grad_norm": 0.18695640563964844, + "learning_rate": 1.484536082474227e-05, + "loss": 0.281, + "step": 322 + }, + { + "epoch": 0.6647800360174942, + "grad_norm": 0.17909488081932068, + "learning_rate": 1.482245131729668e-05, + "loss": 0.2814, + "step": 323 + }, + { + "epoch": 0.6668381785438642, + "grad_norm": 0.19074076414108276, + "learning_rate": 1.4799541809851091e-05, + "loss": 0.2721, + "step": 324 + }, + { + "epoch": 0.6688963210702341, + "grad_norm": 0.19175754487514496, + "learning_rate": 1.47766323024055e-05, + "loss": 0.2754, + "step": 325 + }, + { + "epoch": 0.670954463596604, + "grad_norm": 0.18646575510501862, + "learning_rate": 1.475372279495991e-05, + "loss": 0.2678, + "step": 326 + }, + { + "epoch": 0.673012606122974, + "grad_norm": 0.18553243577480316, + "learning_rate": 1.4730813287514321e-05, + "loss": 0.281, + "step": 327 + }, + { + "epoch": 0.675070748649344, + "grad_norm": 0.17120976746082306, + "learning_rate": 1.470790378006873e-05, + "loss": 0.2691, + "step": 328 + }, + { + "epoch": 0.677128891175714, + "grad_norm": 0.19170524179935455, + "learning_rate": 1.4684994272623139e-05, + "loss": 0.2685, + "step": 329 + }, + { + "epoch": 0.6791870337020839, + "grad_norm": 0.1851339191198349, + "learning_rate": 1.466208476517755e-05, + "loss": 0.266, + "step": 330 + }, + { + "epoch": 0.6812451762284538, + "grad_norm": 0.1678062081336975, + "learning_rate": 1.4639175257731958e-05, + "loss": 0.2609, + "step": 331 + }, + { + "epoch": 0.6833033187548238, + "grad_norm": 0.17913252115249634, + "learning_rate": 1.4616265750286369e-05, + "loss": 0.2716, + "step": 332 + }, + { + "epoch": 0.6853614612811937, + "grad_norm": 0.1859239637851715, + "learning_rate": 1.459335624284078e-05, + "loss": 0.2712, + "step": 333 + }, + { + "epoch": 0.6874196038075637, + "grad_norm": 0.18390226364135742, + "learning_rate": 1.457044673539519e-05, + "loss": 0.2827, + "step": 334 + }, + { + "epoch": 0.6894777463339337, + "grad_norm": 0.18520398437976837, + "learning_rate": 1.4547537227949599e-05, + "loss": 0.2721, + "step": 335 + }, + { + "epoch": 0.6915358888603036, + "grad_norm": 0.18416717648506165, + "learning_rate": 1.452462772050401e-05, + "loss": 0.2683, + "step": 336 + }, + { + "epoch": 0.6935940313866735, + "grad_norm": 0.18727894127368927, + "learning_rate": 1.450171821305842e-05, + "loss": 0.2733, + "step": 337 + }, + { + "epoch": 0.6956521739130435, + "grad_norm": 0.18597093224525452, + "learning_rate": 1.447880870561283e-05, + "loss": 0.2708, + "step": 338 + }, + { + "epoch": 0.6977103164394134, + "grad_norm": 0.1786068081855774, + "learning_rate": 1.445589919816724e-05, + "loss": 0.2667, + "step": 339 + }, + { + "epoch": 0.6997684589657834, + "grad_norm": 0.17466600239276886, + "learning_rate": 1.443298969072165e-05, + "loss": 0.2786, + "step": 340 + }, + { + "epoch": 0.7018266014921534, + "grad_norm": 0.185857355594635, + "learning_rate": 1.4410080183276061e-05, + "loss": 0.2759, + "step": 341 + }, + { + "epoch": 0.7038847440185233, + "grad_norm": 0.2004527747631073, + "learning_rate": 1.4387170675830471e-05, + "loss": 0.2847, + "step": 342 + }, + { + "epoch": 0.7059428865448932, + "grad_norm": 0.18774060904979706, + "learning_rate": 1.436426116838488e-05, + "loss": 0.2766, + "step": 343 + }, + { + "epoch": 0.7080010290712632, + "grad_norm": 0.1840328425168991, + "learning_rate": 1.4341351660939291e-05, + "loss": 0.2722, + "step": 344 + }, + { + "epoch": 0.7100591715976331, + "grad_norm": 0.19089624285697937, + "learning_rate": 1.4318442153493702e-05, + "loss": 0.2779, + "step": 345 + }, + { + "epoch": 0.7121173141240031, + "grad_norm": 0.1848018616437912, + "learning_rate": 1.4295532646048112e-05, + "loss": 0.2739, + "step": 346 + }, + { + "epoch": 0.7141754566503731, + "grad_norm": 0.18844038248062134, + "learning_rate": 1.4272623138602521e-05, + "loss": 0.27, + "step": 347 + }, + { + "epoch": 0.716233599176743, + "grad_norm": 0.19289302825927734, + "learning_rate": 1.4249713631156932e-05, + "loss": 0.2743, + "step": 348 + }, + { + "epoch": 0.7182917417031129, + "grad_norm": 0.18738920986652374, + "learning_rate": 1.4226804123711342e-05, + "loss": 0.2657, + "step": 349 + }, + { + "epoch": 0.7203498842294829, + "grad_norm": 0.1925181746482849, + "learning_rate": 1.4203894616265753e-05, + "loss": 0.2637, + "step": 350 + }, + { + "epoch": 0.7224080267558528, + "grad_norm": 0.19114750623703003, + "learning_rate": 1.4180985108820162e-05, + "loss": 0.2758, + "step": 351 + }, + { + "epoch": 0.7244661692822228, + "grad_norm": 0.18310120701789856, + "learning_rate": 1.4158075601374572e-05, + "loss": 0.2777, + "step": 352 + }, + { + "epoch": 0.7265243118085928, + "grad_norm": 0.2045605331659317, + "learning_rate": 1.4135166093928983e-05, + "loss": 0.2653, + "step": 353 + }, + { + "epoch": 0.7285824543349627, + "grad_norm": 0.1856454759836197, + "learning_rate": 1.4112256586483393e-05, + "loss": 0.267, + "step": 354 + }, + { + "epoch": 0.7306405968613326, + "grad_norm": 0.1855366826057434, + "learning_rate": 1.4089347079037802e-05, + "loss": 0.2805, + "step": 355 + }, + { + "epoch": 0.7326987393877026, + "grad_norm": 0.17913414537906647, + "learning_rate": 1.4066437571592213e-05, + "loss": 0.2755, + "step": 356 + }, + { + "epoch": 0.7347568819140725, + "grad_norm": 0.2057684361934662, + "learning_rate": 1.404352806414662e-05, + "loss": 0.2668, + "step": 357 + }, + { + "epoch": 0.7368150244404424, + "grad_norm": 0.190156951546669, + "learning_rate": 1.402061855670103e-05, + "loss": 0.2778, + "step": 358 + }, + { + "epoch": 0.7388731669668125, + "grad_norm": 0.19387219846248627, + "learning_rate": 1.3997709049255441e-05, + "loss": 0.2785, + "step": 359 + }, + { + "epoch": 0.7409313094931824, + "grad_norm": 0.1933836042881012, + "learning_rate": 1.3974799541809852e-05, + "loss": 0.2661, + "step": 360 + }, + { + "epoch": 0.7429894520195524, + "grad_norm": 0.19618812203407288, + "learning_rate": 1.3951890034364261e-05, + "loss": 0.2622, + "step": 361 + }, + { + "epoch": 0.7450475945459223, + "grad_norm": 0.18786942958831787, + "learning_rate": 1.3928980526918671e-05, + "loss": 0.2695, + "step": 362 + }, + { + "epoch": 0.7471057370722922, + "grad_norm": 0.19361330568790436, + "learning_rate": 1.3906071019473082e-05, + "loss": 0.2869, + "step": 363 + }, + { + "epoch": 0.7491638795986622, + "grad_norm": 0.19813291728496552, + "learning_rate": 1.3883161512027493e-05, + "loss": 0.2753, + "step": 364 + }, + { + "epoch": 0.7512220221250322, + "grad_norm": 0.1891734004020691, + "learning_rate": 1.3860252004581902e-05, + "loss": 0.2694, + "step": 365 + }, + { + "epoch": 0.7532801646514021, + "grad_norm": 0.18902742862701416, + "learning_rate": 1.3837342497136312e-05, + "loss": 0.2675, + "step": 366 + }, + { + "epoch": 0.7553383071777721, + "grad_norm": 0.19838480651378632, + "learning_rate": 1.3814432989690723e-05, + "loss": 0.2721, + "step": 367 + }, + { + "epoch": 0.757396449704142, + "grad_norm": 0.20880939066410065, + "learning_rate": 1.3791523482245133e-05, + "loss": 0.2641, + "step": 368 + }, + { + "epoch": 0.7594545922305119, + "grad_norm": 0.20068003237247467, + "learning_rate": 1.3768613974799542e-05, + "loss": 0.2945, + "step": 369 + }, + { + "epoch": 0.7615127347568819, + "grad_norm": 0.19780132174491882, + "learning_rate": 1.3745704467353953e-05, + "loss": 0.2687, + "step": 370 + }, + { + "epoch": 0.7635708772832519, + "grad_norm": 0.19194689393043518, + "learning_rate": 1.3722794959908363e-05, + "loss": 0.2731, + "step": 371 + }, + { + "epoch": 0.7656290198096218, + "grad_norm": 0.19504573941230774, + "learning_rate": 1.3699885452462774e-05, + "loss": 0.2551, + "step": 372 + }, + { + "epoch": 0.7676871623359918, + "grad_norm": 0.18304413557052612, + "learning_rate": 1.3676975945017183e-05, + "loss": 0.2692, + "step": 373 + }, + { + "epoch": 0.7697453048623617, + "grad_norm": 0.2051483392715454, + "learning_rate": 1.3654066437571593e-05, + "loss": 0.2791, + "step": 374 + }, + { + "epoch": 0.7718034473887316, + "grad_norm": 0.18748973309993744, + "learning_rate": 1.3631156930126004e-05, + "loss": 0.2671, + "step": 375 + }, + { + "epoch": 0.7738615899151016, + "grad_norm": 0.19167177379131317, + "learning_rate": 1.3608247422680415e-05, + "loss": 0.2766, + "step": 376 + }, + { + "epoch": 0.7759197324414716, + "grad_norm": 0.17931750416755676, + "learning_rate": 1.3585337915234824e-05, + "loss": 0.2748, + "step": 377 + }, + { + "epoch": 0.7779778749678415, + "grad_norm": 0.19437509775161743, + "learning_rate": 1.3562428407789234e-05, + "loss": 0.2667, + "step": 378 + }, + { + "epoch": 0.7800360174942115, + "grad_norm": 0.19813868403434753, + "learning_rate": 1.3539518900343645e-05, + "loss": 0.2771, + "step": 379 + }, + { + "epoch": 0.7820941600205814, + "grad_norm": 0.19205260276794434, + "learning_rate": 1.3516609392898055e-05, + "loss": 0.2703, + "step": 380 + }, + { + "epoch": 0.7841523025469513, + "grad_norm": 0.19039763510227203, + "learning_rate": 1.3493699885452464e-05, + "loss": 0.264, + "step": 381 + }, + { + "epoch": 0.7862104450733213, + "grad_norm": 0.18269500136375427, + "learning_rate": 1.3470790378006875e-05, + "loss": 0.2653, + "step": 382 + }, + { + "epoch": 0.7882685875996913, + "grad_norm": 0.1922067403793335, + "learning_rate": 1.3447880870561285e-05, + "loss": 0.2754, + "step": 383 + }, + { + "epoch": 0.7903267301260612, + "grad_norm": 0.19615666568279266, + "learning_rate": 1.3424971363115693e-05, + "loss": 0.2811, + "step": 384 + }, + { + "epoch": 0.7923848726524312, + "grad_norm": 0.19037973880767822, + "learning_rate": 1.3402061855670103e-05, + "loss": 0.2673, + "step": 385 + }, + { + "epoch": 0.7944430151788011, + "grad_norm": 0.191124826669693, + "learning_rate": 1.3379152348224514e-05, + "loss": 0.2683, + "step": 386 + }, + { + "epoch": 0.796501157705171, + "grad_norm": 0.18429923057556152, + "learning_rate": 1.3356242840778923e-05, + "loss": 0.2698, + "step": 387 + }, + { + "epoch": 0.798559300231541, + "grad_norm": 0.1839045137166977, + "learning_rate": 1.3333333333333333e-05, + "loss": 0.2895, + "step": 388 + }, + { + "epoch": 0.800617442757911, + "grad_norm": 0.1944131702184677, + "learning_rate": 1.3310423825887744e-05, + "loss": 0.2641, + "step": 389 + }, + { + "epoch": 0.802675585284281, + "grad_norm": 0.20407740771770477, + "learning_rate": 1.3287514318442154e-05, + "loss": 0.2743, + "step": 390 + }, + { + "epoch": 0.8047337278106509, + "grad_norm": 0.1814037561416626, + "learning_rate": 1.3264604810996563e-05, + "loss": 0.2672, + "step": 391 + }, + { + "epoch": 0.8067918703370208, + "grad_norm": 0.1886950582265854, + "learning_rate": 1.3241695303550974e-05, + "loss": 0.2725, + "step": 392 + }, + { + "epoch": 0.8088500128633908, + "grad_norm": 0.19429941475391388, + "learning_rate": 1.3218785796105385e-05, + "loss": 0.2669, + "step": 393 + }, + { + "epoch": 0.8109081553897607, + "grad_norm": 0.19143058359622955, + "learning_rate": 1.3195876288659795e-05, + "loss": 0.2659, + "step": 394 + }, + { + "epoch": 0.8129662979161307, + "grad_norm": 0.2213468849658966, + "learning_rate": 1.3172966781214204e-05, + "loss": 0.2764, + "step": 395 + }, + { + "epoch": 0.8150244404425007, + "grad_norm": 0.2040800005197525, + "learning_rate": 1.3150057273768615e-05, + "loss": 0.2783, + "step": 396 + }, + { + "epoch": 0.8170825829688706, + "grad_norm": 0.1948375254869461, + "learning_rate": 1.3127147766323025e-05, + "loss": 0.2689, + "step": 397 + }, + { + "epoch": 0.8191407254952405, + "grad_norm": 0.1915021538734436, + "learning_rate": 1.3104238258877436e-05, + "loss": 0.2808, + "step": 398 + }, + { + "epoch": 0.8211988680216105, + "grad_norm": 0.19760248064994812, + "learning_rate": 1.3081328751431845e-05, + "loss": 0.2712, + "step": 399 + }, + { + "epoch": 0.8232570105479804, + "grad_norm": 0.2082677185535431, + "learning_rate": 1.3058419243986255e-05, + "loss": 0.2707, + "step": 400 + }, + { + "epoch": 0.8232570105479804, + "eval_loss": 0.28778496384620667, + "eval_runtime": 2427.9096, + "eval_samples_per_second": 3.202, + "eval_steps_per_second": 0.801, + "step": 400 + }, + { + "epoch": 0.8253151530743504, + "grad_norm": 0.19694332778453827, + "learning_rate": 1.3035509736540666e-05, + "loss": 0.2801, + "step": 401 + }, + { + "epoch": 0.8273732956007204, + "grad_norm": 0.19448824226856232, + "learning_rate": 1.3012600229095077e-05, + "loss": 0.2632, + "step": 402 + }, + { + "epoch": 0.8294314381270903, + "grad_norm": 0.18745476007461548, + "learning_rate": 1.2989690721649485e-05, + "loss": 0.2773, + "step": 403 + }, + { + "epoch": 0.8314895806534602, + "grad_norm": 0.19524575769901276, + "learning_rate": 1.2966781214203896e-05, + "loss": 0.2594, + "step": 404 + }, + { + "epoch": 0.8335477231798302, + "grad_norm": 0.19612252712249756, + "learning_rate": 1.2943871706758307e-05, + "loss": 0.271, + "step": 405 + }, + { + "epoch": 0.8356058657062001, + "grad_norm": 0.19964493811130524, + "learning_rate": 1.2920962199312717e-05, + "loss": 0.2615, + "step": 406 + }, + { + "epoch": 0.8376640082325701, + "grad_norm": 0.20115099847316742, + "learning_rate": 1.2898052691867126e-05, + "loss": 0.269, + "step": 407 + }, + { + "epoch": 0.8397221507589401, + "grad_norm": 0.18949687480926514, + "learning_rate": 1.2875143184421537e-05, + "loss": 0.2649, + "step": 408 + }, + { + "epoch": 0.84178029328531, + "grad_norm": 0.1931927353143692, + "learning_rate": 1.2852233676975947e-05, + "loss": 0.2611, + "step": 409 + }, + { + "epoch": 0.8438384358116799, + "grad_norm": 0.18723614513874054, + "learning_rate": 1.2829324169530358e-05, + "loss": 0.2699, + "step": 410 + }, + { + "epoch": 0.8458965783380499, + "grad_norm": 0.19405977427959442, + "learning_rate": 1.2806414662084765e-05, + "loss": 0.2691, + "step": 411 + }, + { + "epoch": 0.8479547208644198, + "grad_norm": 0.2021879404783249, + "learning_rate": 1.2783505154639176e-05, + "loss": 0.267, + "step": 412 + }, + { + "epoch": 0.8500128633907899, + "grad_norm": 0.20015574991703033, + "learning_rate": 1.2760595647193586e-05, + "loss": 0.2632, + "step": 413 + }, + { + "epoch": 0.8520710059171598, + "grad_norm": 0.19090059399604797, + "learning_rate": 1.2737686139747995e-05, + "loss": 0.2743, + "step": 414 + }, + { + "epoch": 0.8541291484435297, + "grad_norm": 0.1906920224428177, + "learning_rate": 1.2714776632302406e-05, + "loss": 0.2723, + "step": 415 + }, + { + "epoch": 0.8561872909698997, + "grad_norm": 0.19348129630088806, + "learning_rate": 1.2691867124856816e-05, + "loss": 0.2656, + "step": 416 + }, + { + "epoch": 0.8582454334962696, + "grad_norm": 0.18771213293075562, + "learning_rate": 1.2668957617411227e-05, + "loss": 0.2617, + "step": 417 + }, + { + "epoch": 0.8603035760226395, + "grad_norm": 0.2135135382413864, + "learning_rate": 1.2646048109965636e-05, + "loss": 0.2773, + "step": 418 + }, + { + "epoch": 0.8623617185490096, + "grad_norm": 0.19689443707466125, + "learning_rate": 1.2623138602520046e-05, + "loss": 0.2623, + "step": 419 + }, + { + "epoch": 0.8644198610753795, + "grad_norm": 0.18752440810203552, + "learning_rate": 1.2600229095074457e-05, + "loss": 0.2599, + "step": 420 + }, + { + "epoch": 0.8664780036017494, + "grad_norm": 0.19264395534992218, + "learning_rate": 1.2577319587628866e-05, + "loss": 0.2707, + "step": 421 + }, + { + "epoch": 0.8685361461281194, + "grad_norm": 0.19980797171592712, + "learning_rate": 1.2554410080183277e-05, + "loss": 0.2616, + "step": 422 + }, + { + "epoch": 0.8705942886544893, + "grad_norm": 0.22940242290496826, + "learning_rate": 1.2531500572737687e-05, + "loss": 0.2712, + "step": 423 + }, + { + "epoch": 0.8726524311808592, + "grad_norm": 0.18825359642505646, + "learning_rate": 1.2508591065292098e-05, + "loss": 0.2779, + "step": 424 + }, + { + "epoch": 0.8747105737072293, + "grad_norm": 0.21553562581539154, + "learning_rate": 1.2485681557846507e-05, + "loss": 0.2677, + "step": 425 + }, + { + "epoch": 0.8767687162335992, + "grad_norm": 0.2025568038225174, + "learning_rate": 1.2462772050400917e-05, + "loss": 0.2659, + "step": 426 + }, + { + "epoch": 0.8788268587599691, + "grad_norm": 0.19179950654506683, + "learning_rate": 1.2439862542955328e-05, + "loss": 0.2762, + "step": 427 + }, + { + "epoch": 0.8808850012863391, + "grad_norm": 0.20982210338115692, + "learning_rate": 1.2416953035509738e-05, + "loss": 0.2648, + "step": 428 + }, + { + "epoch": 0.882943143812709, + "grad_norm": 0.2084280252456665, + "learning_rate": 1.2394043528064147e-05, + "loss": 0.2806, + "step": 429 + }, + { + "epoch": 0.8850012863390789, + "grad_norm": 0.1993308663368225, + "learning_rate": 1.2371134020618558e-05, + "loss": 0.2673, + "step": 430 + }, + { + "epoch": 0.887059428865449, + "grad_norm": 0.1917535811662674, + "learning_rate": 1.2348224513172968e-05, + "loss": 0.2596, + "step": 431 + }, + { + "epoch": 0.8891175713918189, + "grad_norm": 0.18980742990970612, + "learning_rate": 1.2325315005727379e-05, + "loss": 0.2607, + "step": 432 + }, + { + "epoch": 0.8911757139181888, + "grad_norm": 0.21062685549259186, + "learning_rate": 1.2302405498281788e-05, + "loss": 0.2612, + "step": 433 + }, + { + "epoch": 0.8932338564445588, + "grad_norm": 0.20591405034065247, + "learning_rate": 1.2279495990836199e-05, + "loss": 0.2698, + "step": 434 + }, + { + "epoch": 0.8952919989709287, + "grad_norm": 0.2052398920059204, + "learning_rate": 1.2256586483390609e-05, + "loss": 0.2673, + "step": 435 + }, + { + "epoch": 0.8973501414972986, + "grad_norm": 0.19963452219963074, + "learning_rate": 1.223367697594502e-05, + "loss": 0.266, + "step": 436 + }, + { + "epoch": 0.8994082840236687, + "grad_norm": 0.1929163783788681, + "learning_rate": 1.2210767468499429e-05, + "loss": 0.2605, + "step": 437 + }, + { + "epoch": 0.9014664265500386, + "grad_norm": 0.19121681153774261, + "learning_rate": 1.218785796105384e-05, + "loss": 0.2642, + "step": 438 + }, + { + "epoch": 0.9035245690764085, + "grad_norm": 0.18931221961975098, + "learning_rate": 1.2164948453608248e-05, + "loss": 0.2653, + "step": 439 + }, + { + "epoch": 0.9055827116027785, + "grad_norm": 0.21359370648860931, + "learning_rate": 1.2142038946162657e-05, + "loss": 0.264, + "step": 440 + }, + { + "epoch": 0.9076408541291484, + "grad_norm": 0.1874193251132965, + "learning_rate": 1.2119129438717068e-05, + "loss": 0.2664, + "step": 441 + }, + { + "epoch": 0.9096989966555183, + "grad_norm": 0.19697226583957672, + "learning_rate": 1.2096219931271478e-05, + "loss": 0.2651, + "step": 442 + }, + { + "epoch": 0.9117571391818884, + "grad_norm": 0.20930957794189453, + "learning_rate": 1.2073310423825889e-05, + "loss": 0.2724, + "step": 443 + }, + { + "epoch": 0.9138152817082583, + "grad_norm": 0.19588977098464966, + "learning_rate": 1.2050400916380298e-05, + "loss": 0.2648, + "step": 444 + }, + { + "epoch": 0.9158734242346283, + "grad_norm": 0.19452017545700073, + "learning_rate": 1.2027491408934708e-05, + "loss": 0.2808, + "step": 445 + }, + { + "epoch": 0.9179315667609982, + "grad_norm": 0.19226408004760742, + "learning_rate": 1.2004581901489119e-05, + "loss": 0.2627, + "step": 446 + }, + { + "epoch": 0.9199897092873681, + "grad_norm": 0.18108274042606354, + "learning_rate": 1.198167239404353e-05, + "loss": 0.2693, + "step": 447 + }, + { + "epoch": 0.922047851813738, + "grad_norm": 0.19352363049983978, + "learning_rate": 1.1958762886597938e-05, + "loss": 0.2705, + "step": 448 + }, + { + "epoch": 0.9241059943401081, + "grad_norm": 0.18535122275352478, + "learning_rate": 1.1935853379152349e-05, + "loss": 0.2608, + "step": 449 + }, + { + "epoch": 0.926164136866478, + "grad_norm": 0.19209617376327515, + "learning_rate": 1.191294387170676e-05, + "loss": 0.2702, + "step": 450 + }, + { + "epoch": 0.928222279392848, + "grad_norm": 0.1866796910762787, + "learning_rate": 1.189003436426117e-05, + "loss": 0.264, + "step": 451 + }, + { + "epoch": 0.9302804219192179, + "grad_norm": 0.21708665788173676, + "learning_rate": 1.1867124856815579e-05, + "loss": 0.2693, + "step": 452 + }, + { + "epoch": 0.9323385644455878, + "grad_norm": 0.19297796487808228, + "learning_rate": 1.184421534936999e-05, + "loss": 0.2745, + "step": 453 + }, + { + "epoch": 0.9343967069719578, + "grad_norm": 0.19070400297641754, + "learning_rate": 1.18213058419244e-05, + "loss": 0.265, + "step": 454 + }, + { + "epoch": 0.9364548494983278, + "grad_norm": 0.19821566343307495, + "learning_rate": 1.1798396334478809e-05, + "loss": 0.2674, + "step": 455 + }, + { + "epoch": 0.9385129920246977, + "grad_norm": 0.2032192200422287, + "learning_rate": 1.177548682703322e-05, + "loss": 0.276, + "step": 456 + }, + { + "epoch": 0.9405711345510677, + "grad_norm": 0.19127750396728516, + "learning_rate": 1.175257731958763e-05, + "loss": 0.2696, + "step": 457 + }, + { + "epoch": 0.9426292770774376, + "grad_norm": 0.19187286496162415, + "learning_rate": 1.1729667812142041e-05, + "loss": 0.2601, + "step": 458 + }, + { + "epoch": 0.9446874196038075, + "grad_norm": 0.20871371030807495, + "learning_rate": 1.170675830469645e-05, + "loss": 0.2687, + "step": 459 + }, + { + "epoch": 0.9467455621301775, + "grad_norm": 0.19228306412696838, + "learning_rate": 1.168384879725086e-05, + "loss": 0.2633, + "step": 460 + }, + { + "epoch": 0.9488037046565475, + "grad_norm": 0.19025444984436035, + "learning_rate": 1.1660939289805271e-05, + "loss": 0.2721, + "step": 461 + }, + { + "epoch": 0.9508618471829174, + "grad_norm": 0.19476914405822754, + "learning_rate": 1.1638029782359682e-05, + "loss": 0.2662, + "step": 462 + }, + { + "epoch": 0.9529199897092874, + "grad_norm": 0.1991666853427887, + "learning_rate": 1.161512027491409e-05, + "loss": 0.269, + "step": 463 + }, + { + "epoch": 0.9549781322356573, + "grad_norm": 0.19385920464992523, + "learning_rate": 1.1592210767468501e-05, + "loss": 0.2647, + "step": 464 + }, + { + "epoch": 0.9570362747620272, + "grad_norm": 0.1911603957414627, + "learning_rate": 1.1569301260022912e-05, + "loss": 0.2679, + "step": 465 + }, + { + "epoch": 0.9590944172883972, + "grad_norm": 0.20373377203941345, + "learning_rate": 1.1546391752577319e-05, + "loss": 0.2694, + "step": 466 + }, + { + "epoch": 0.9611525598147672, + "grad_norm": 0.20550350844860077, + "learning_rate": 1.152348224513173e-05, + "loss": 0.2677, + "step": 467 + }, + { + "epoch": 0.9632107023411371, + "grad_norm": 0.2049354463815689, + "learning_rate": 1.150057273768614e-05, + "loss": 0.2752, + "step": 468 + }, + { + "epoch": 0.9652688448675071, + "grad_norm": 0.21691595017910004, + "learning_rate": 1.147766323024055e-05, + "loss": 0.2727, + "step": 469 + }, + { + "epoch": 0.967326987393877, + "grad_norm": 0.20727306604385376, + "learning_rate": 1.145475372279496e-05, + "loss": 0.2575, + "step": 470 + }, + { + "epoch": 0.969385129920247, + "grad_norm": 0.19166423380374908, + "learning_rate": 1.143184421534937e-05, + "loss": 0.2716, + "step": 471 + }, + { + "epoch": 0.9714432724466169, + "grad_norm": 0.18833886086940765, + "learning_rate": 1.140893470790378e-05, + "loss": 0.2651, + "step": 472 + }, + { + "epoch": 0.9735014149729869, + "grad_norm": 0.19680088758468628, + "learning_rate": 1.1386025200458191e-05, + "loss": 0.2621, + "step": 473 + }, + { + "epoch": 0.9755595574993569, + "grad_norm": 0.20966476202011108, + "learning_rate": 1.13631156930126e-05, + "loss": 0.2725, + "step": 474 + }, + { + "epoch": 0.9776177000257268, + "grad_norm": 0.1963450163602829, + "learning_rate": 1.134020618556701e-05, + "loss": 0.2569, + "step": 475 + }, + { + "epoch": 0.9796758425520967, + "grad_norm": 0.21289944648742676, + "learning_rate": 1.1317296678121421e-05, + "loss": 0.2622, + "step": 476 + }, + { + "epoch": 0.9817339850784667, + "grad_norm": 0.2103341966867447, + "learning_rate": 1.1294387170675832e-05, + "loss": 0.2803, + "step": 477 + }, + { + "epoch": 0.9837921276048366, + "grad_norm": 0.20202945172786713, + "learning_rate": 1.1271477663230241e-05, + "loss": 0.273, + "step": 478 + }, + { + "epoch": 0.9858502701312066, + "grad_norm": 0.18241006135940552, + "learning_rate": 1.1248568155784651e-05, + "loss": 0.2721, + "step": 479 + }, + { + "epoch": 0.9879084126575766, + "grad_norm": 0.19221259653568268, + "learning_rate": 1.1225658648339062e-05, + "loss": 0.2646, + "step": 480 + }, + { + "epoch": 0.9899665551839465, + "grad_norm": 0.19371837377548218, + "learning_rate": 1.1202749140893473e-05, + "loss": 0.2519, + "step": 481 + }, + { + "epoch": 0.9920246977103164, + "grad_norm": 0.1972094029188156, + "learning_rate": 1.1179839633447882e-05, + "loss": 0.2555, + "step": 482 + }, + { + "epoch": 0.9940828402366864, + "grad_norm": 0.19414126873016357, + "learning_rate": 1.1156930126002292e-05, + "loss": 0.2726, + "step": 483 + }, + { + "epoch": 0.9961409827630563, + "grad_norm": 0.18993492424488068, + "learning_rate": 1.1134020618556703e-05, + "loss": 0.2644, + "step": 484 + }, + { + "epoch": 0.9981991252894263, + "grad_norm": 0.19713927805423737, + "learning_rate": 1.1111111111111113e-05, + "loss": 0.2569, + "step": 485 + }, + { + "epoch": 1.00205814252637, + "grad_norm": 0.3423589766025543, + "learning_rate": 1.1088201603665522e-05, + "loss": 0.5285, + "step": 486 + }, + { + "epoch": 1.0041162850527399, + "grad_norm": 0.1901763528585434, + "learning_rate": 1.1065292096219933e-05, + "loss": 0.2621, + "step": 487 + }, + { + "epoch": 1.0061744275791098, + "grad_norm": 0.20508776605129242, + "learning_rate": 1.1042382588774343e-05, + "loss": 0.2665, + "step": 488 + }, + { + "epoch": 1.0082325701054797, + "grad_norm": 0.20188146829605103, + "learning_rate": 1.1019473081328752e-05, + "loss": 0.2547, + "step": 489 + }, + { + "epoch": 1.0102907126318497, + "grad_norm": 0.20245613157749176, + "learning_rate": 1.0996563573883163e-05, + "loss": 0.2657, + "step": 490 + }, + { + "epoch": 1.0123488551582196, + "grad_norm": 0.19711382687091827, + "learning_rate": 1.0973654066437574e-05, + "loss": 0.2597, + "step": 491 + }, + { + "epoch": 1.0144069976845898, + "grad_norm": 0.21538953483104706, + "learning_rate": 1.0950744558991984e-05, + "loss": 0.2727, + "step": 492 + }, + { + "epoch": 1.0164651402109597, + "grad_norm": 0.20296984910964966, + "learning_rate": 1.0927835051546391e-05, + "loss": 0.2634, + "step": 493 + }, + { + "epoch": 1.0185232827373296, + "grad_norm": 0.20134592056274414, + "learning_rate": 1.0904925544100802e-05, + "loss": 0.2596, + "step": 494 + }, + { + "epoch": 1.0205814252636995, + "grad_norm": 0.200101837515831, + "learning_rate": 1.0882016036655212e-05, + "loss": 0.2575, + "step": 495 + }, + { + "epoch": 1.0226395677900695, + "grad_norm": 0.19144928455352783, + "learning_rate": 1.0859106529209621e-05, + "loss": 0.263, + "step": 496 + }, + { + "epoch": 1.0246977103164394, + "grad_norm": 0.19832482933998108, + "learning_rate": 1.0836197021764032e-05, + "loss": 0.2656, + "step": 497 + }, + { + "epoch": 1.0267558528428093, + "grad_norm": 0.20965202152729034, + "learning_rate": 1.0813287514318443e-05, + "loss": 0.2611, + "step": 498 + }, + { + "epoch": 1.0288139953691793, + "grad_norm": 0.1974337100982666, + "learning_rate": 1.0790378006872853e-05, + "loss": 0.2667, + "step": 499 + }, + { + "epoch": 1.0308721378955492, + "grad_norm": 0.20611713826656342, + "learning_rate": 1.0767468499427262e-05, + "loss": 0.2674, + "step": 500 + }, + { + "epoch": 1.0308721378955492, + "eval_loss": 0.2836935222148895, + "eval_runtime": 2423.44, + "eval_samples_per_second": 3.208, + "eval_steps_per_second": 0.802, + "step": 500 + }, + { + "epoch": 1.0329302804219191, + "grad_norm": 0.202958345413208, + "learning_rate": 1.0744558991981673e-05, + "loss": 0.2684, + "step": 501 + }, + { + "epoch": 1.034988422948289, + "grad_norm": 0.1984429508447647, + "learning_rate": 1.0721649484536083e-05, + "loss": 0.2557, + "step": 502 + }, + { + "epoch": 1.0370465654746592, + "grad_norm": 0.19396482408046722, + "learning_rate": 1.0698739977090494e-05, + "loss": 0.255, + "step": 503 + }, + { + "epoch": 1.0391047080010292, + "grad_norm": 0.19176840782165527, + "learning_rate": 1.0675830469644903e-05, + "loss": 0.2675, + "step": 504 + }, + { + "epoch": 1.041162850527399, + "grad_norm": 0.20167966187000275, + "learning_rate": 1.0652920962199313e-05, + "loss": 0.2669, + "step": 505 + }, + { + "epoch": 1.043220993053769, + "grad_norm": 0.2049783617258072, + "learning_rate": 1.0630011454753724e-05, + "loss": 0.2446, + "step": 506 + }, + { + "epoch": 1.045279135580139, + "grad_norm": 0.19293472170829773, + "learning_rate": 1.0607101947308135e-05, + "loss": 0.256, + "step": 507 + }, + { + "epoch": 1.047337278106509, + "grad_norm": 0.19432370364665985, + "learning_rate": 1.0584192439862543e-05, + "loss": 0.2605, + "step": 508 + }, + { + "epoch": 1.0493954206328788, + "grad_norm": 0.19784876704216003, + "learning_rate": 1.0561282932416954e-05, + "loss": 0.2617, + "step": 509 + }, + { + "epoch": 1.0514535631592488, + "grad_norm": 0.19982090592384338, + "learning_rate": 1.0538373424971365e-05, + "loss": 0.264, + "step": 510 + }, + { + "epoch": 1.0535117056856187, + "grad_norm": 0.2019587755203247, + "learning_rate": 1.0515463917525775e-05, + "loss": 0.2543, + "step": 511 + }, + { + "epoch": 1.0555698482119886, + "grad_norm": 0.19848807156085968, + "learning_rate": 1.0492554410080184e-05, + "loss": 0.2613, + "step": 512 + }, + { + "epoch": 1.0576279907383586, + "grad_norm": 0.20360374450683594, + "learning_rate": 1.0469644902634595e-05, + "loss": 0.2675, + "step": 513 + }, + { + "epoch": 1.0596861332647285, + "grad_norm": 0.19209840893745422, + "learning_rate": 1.0446735395189005e-05, + "loss": 0.2517, + "step": 514 + }, + { + "epoch": 1.0617442757910984, + "grad_norm": 0.19142381846904755, + "learning_rate": 1.0423825887743416e-05, + "loss": 0.2631, + "step": 515 + }, + { + "epoch": 1.0638024183174686, + "grad_norm": 0.20222575962543488, + "learning_rate": 1.0400916380297825e-05, + "loss": 0.2625, + "step": 516 + }, + { + "epoch": 1.0658605608438385, + "grad_norm": 0.1984448879957199, + "learning_rate": 1.0378006872852235e-05, + "loss": 0.2584, + "step": 517 + }, + { + "epoch": 1.0679187033702084, + "grad_norm": 0.1992885023355484, + "learning_rate": 1.0355097365406646e-05, + "loss": 0.2609, + "step": 518 + }, + { + "epoch": 1.0699768458965784, + "grad_norm": 0.20708978176116943, + "learning_rate": 1.0332187857961057e-05, + "loss": 0.2618, + "step": 519 + }, + { + "epoch": 1.0720349884229483, + "grad_norm": 0.22806766629219055, + "learning_rate": 1.0309278350515464e-05, + "loss": 0.2634, + "step": 520 + }, + { + "epoch": 1.0740931309493182, + "grad_norm": 0.2019941806793213, + "learning_rate": 1.0286368843069874e-05, + "loss": 0.2588, + "step": 521 + }, + { + "epoch": 1.0761512734756882, + "grad_norm": 0.19460470974445343, + "learning_rate": 1.0263459335624283e-05, + "loss": 0.2692, + "step": 522 + }, + { + "epoch": 1.078209416002058, + "grad_norm": 0.19483187794685364, + "learning_rate": 1.0240549828178694e-05, + "loss": 0.2474, + "step": 523 + }, + { + "epoch": 1.080267558528428, + "grad_norm": 0.2199576050043106, + "learning_rate": 1.0217640320733104e-05, + "loss": 0.2582, + "step": 524 + }, + { + "epoch": 1.082325701054798, + "grad_norm": 0.20485302805900574, + "learning_rate": 1.0194730813287515e-05, + "loss": 0.2463, + "step": 525 + }, + { + "epoch": 1.084383843581168, + "grad_norm": 0.20773454010486603, + "learning_rate": 1.0171821305841924e-05, + "loss": 0.2501, + "step": 526 + }, + { + "epoch": 1.086441986107538, + "grad_norm": 0.19593262672424316, + "learning_rate": 1.0148911798396335e-05, + "loss": 0.2608, + "step": 527 + }, + { + "epoch": 1.088500128633908, + "grad_norm": 0.20500554144382477, + "learning_rate": 1.0126002290950745e-05, + "loss": 0.2586, + "step": 528 + }, + { + "epoch": 1.090558271160278, + "grad_norm": 0.19919747114181519, + "learning_rate": 1.0103092783505156e-05, + "loss": 0.2724, + "step": 529 + }, + { + "epoch": 1.0926164136866479, + "grad_norm": 0.1953326314687729, + "learning_rate": 1.0080183276059565e-05, + "loss": 0.2456, + "step": 530 + }, + { + "epoch": 1.0946745562130178, + "grad_norm": 0.2155047059059143, + "learning_rate": 1.0057273768613975e-05, + "loss": 0.2644, + "step": 531 + }, + { + "epoch": 1.0967326987393877, + "grad_norm": 0.19747495651245117, + "learning_rate": 1.0034364261168386e-05, + "loss": 0.2539, + "step": 532 + }, + { + "epoch": 1.0987908412657577, + "grad_norm": 0.20261652767658234, + "learning_rate": 1.0011454753722796e-05, + "loss": 0.255, + "step": 533 + }, + { + "epoch": 1.1008489837921276, + "grad_norm": 0.19529719650745392, + "learning_rate": 9.988545246277205e-06, + "loss": 0.2489, + "step": 534 + }, + { + "epoch": 1.1029071263184975, + "grad_norm": 0.20239490270614624, + "learning_rate": 9.965635738831616e-06, + "loss": 0.2664, + "step": 535 + }, + { + "epoch": 1.1049652688448675, + "grad_norm": 0.19377024471759796, + "learning_rate": 9.942726231386026e-06, + "loss": 0.2615, + "step": 536 + }, + { + "epoch": 1.1070234113712374, + "grad_norm": 0.20523156225681305, + "learning_rate": 9.919816723940437e-06, + "loss": 0.2548, + "step": 537 + }, + { + "epoch": 1.1090815538976073, + "grad_norm": 0.2046228051185608, + "learning_rate": 9.896907216494846e-06, + "loss": 0.2704, + "step": 538 + }, + { + "epoch": 1.1111396964239773, + "grad_norm": 0.21209484338760376, + "learning_rate": 9.873997709049257e-06, + "loss": 0.2637, + "step": 539 + }, + { + "epoch": 1.1131978389503474, + "grad_norm": 0.20251420140266418, + "learning_rate": 9.851088201603667e-06, + "loss": 0.2617, + "step": 540 + }, + { + "epoch": 1.1152559814767173, + "grad_norm": 0.21695846319198608, + "learning_rate": 9.828178694158076e-06, + "loss": 0.2658, + "step": 541 + }, + { + "epoch": 1.1173141240030873, + "grad_norm": 0.2015303075313568, + "learning_rate": 9.805269186712487e-06, + "loss": 0.2528, + "step": 542 + }, + { + "epoch": 1.1193722665294572, + "grad_norm": 0.21796390414237976, + "learning_rate": 9.782359679266896e-06, + "loss": 0.2625, + "step": 543 + }, + { + "epoch": 1.1214304090558271, + "grad_norm": 0.20676304399967194, + "learning_rate": 9.759450171821306e-06, + "loss": 0.268, + "step": 544 + }, + { + "epoch": 1.123488551582197, + "grad_norm": 0.1986500769853592, + "learning_rate": 9.736540664375717e-06, + "loss": 0.2546, + "step": 545 + }, + { + "epoch": 1.125546694108567, + "grad_norm": 0.20008589327335358, + "learning_rate": 9.713631156930127e-06, + "loss": 0.2525, + "step": 546 + }, + { + "epoch": 1.127604836634937, + "grad_norm": 0.1891598105430603, + "learning_rate": 9.690721649484536e-06, + "loss": 0.256, + "step": 547 + }, + { + "epoch": 1.1296629791613069, + "grad_norm": 0.20968230068683624, + "learning_rate": 9.667812142038947e-06, + "loss": 0.2495, + "step": 548 + }, + { + "epoch": 1.1317211216876768, + "grad_norm": 0.2025834023952484, + "learning_rate": 9.644902634593357e-06, + "loss": 0.2533, + "step": 549 + }, + { + "epoch": 1.1337792642140467, + "grad_norm": 0.21087367832660675, + "learning_rate": 9.621993127147768e-06, + "loss": 0.2518, + "step": 550 + }, + { + "epoch": 1.1358374067404169, + "grad_norm": 0.20784996449947357, + "learning_rate": 9.599083619702177e-06, + "loss": 0.2594, + "step": 551 + }, + { + "epoch": 1.1378955492667868, + "grad_norm": 0.20754118263721466, + "learning_rate": 9.576174112256587e-06, + "loss": 0.2515, + "step": 552 + }, + { + "epoch": 1.1399536917931568, + "grad_norm": 0.225090891122818, + "learning_rate": 9.553264604810998e-06, + "loss": 0.2615, + "step": 553 + }, + { + "epoch": 1.1420118343195267, + "grad_norm": 0.24656590819358826, + "learning_rate": 9.530355097365407e-06, + "loss": 0.2636, + "step": 554 + }, + { + "epoch": 1.1440699768458966, + "grad_norm": 0.22454337775707245, + "learning_rate": 9.507445589919818e-06, + "loss": 0.2584, + "step": 555 + }, + { + "epoch": 1.1461281193722666, + "grad_norm": 0.2229425013065338, + "learning_rate": 9.484536082474226e-06, + "loss": 0.2543, + "step": 556 + }, + { + "epoch": 1.1481862618986365, + "grad_norm": 0.18805071711540222, + "learning_rate": 9.461626575028637e-06, + "loss": 0.2593, + "step": 557 + }, + { + "epoch": 1.1502444044250064, + "grad_norm": 0.23163346946239471, + "learning_rate": 9.438717067583048e-06, + "loss": 0.2537, + "step": 558 + }, + { + "epoch": 1.1523025469513763, + "grad_norm": 0.2126983255147934, + "learning_rate": 9.415807560137458e-06, + "loss": 0.2598, + "step": 559 + }, + { + "epoch": 1.1543606894777463, + "grad_norm": 0.2113332748413086, + "learning_rate": 9.392898052691867e-06, + "loss": 0.2617, + "step": 560 + }, + { + "epoch": 1.1564188320041162, + "grad_norm": 0.2220505177974701, + "learning_rate": 9.369988545246278e-06, + "loss": 0.2673, + "step": 561 + }, + { + "epoch": 1.1584769745304861, + "grad_norm": 0.21683354675769806, + "learning_rate": 9.347079037800688e-06, + "loss": 0.259, + "step": 562 + }, + { + "epoch": 1.160535117056856, + "grad_norm": 0.20226940512657166, + "learning_rate": 9.324169530355099e-06, + "loss": 0.2536, + "step": 563 + }, + { + "epoch": 1.1625932595832262, + "grad_norm": 0.2166106402873993, + "learning_rate": 9.301260022909508e-06, + "loss": 0.2573, + "step": 564 + }, + { + "epoch": 1.1646514021095962, + "grad_norm": 0.21802830696105957, + "learning_rate": 9.278350515463918e-06, + "loss": 0.2604, + "step": 565 + }, + { + "epoch": 1.166709544635966, + "grad_norm": 0.19723279774188995, + "learning_rate": 9.255441008018329e-06, + "loss": 0.2643, + "step": 566 + }, + { + "epoch": 1.168767687162336, + "grad_norm": 0.20100893080234528, + "learning_rate": 9.23253150057274e-06, + "loss": 0.2601, + "step": 567 + }, + { + "epoch": 1.170825829688706, + "grad_norm": 0.19834032654762268, + "learning_rate": 9.209621993127148e-06, + "loss": 0.2624, + "step": 568 + }, + { + "epoch": 1.172883972215076, + "grad_norm": 0.20677493512630463, + "learning_rate": 9.186712485681557e-06, + "loss": 0.2527, + "step": 569 + }, + { + "epoch": 1.1749421147414458, + "grad_norm": 0.20895297825336456, + "learning_rate": 9.163802978235968e-06, + "loss": 0.2519, + "step": 570 + }, + { + "epoch": 1.1770002572678158, + "grad_norm": 0.19748030602931976, + "learning_rate": 9.140893470790379e-06, + "loss": 0.2567, + "step": 571 + }, + { + "epoch": 1.1790583997941857, + "grad_norm": 0.20713521540164948, + "learning_rate": 9.117983963344789e-06, + "loss": 0.2771, + "step": 572 + }, + { + "epoch": 1.1811165423205556, + "grad_norm": 0.2146754264831543, + "learning_rate": 9.095074455899198e-06, + "loss": 0.2537, + "step": 573 + }, + { + "epoch": 1.1831746848469256, + "grad_norm": 0.20723004639148712, + "learning_rate": 9.072164948453609e-06, + "loss": 0.253, + "step": 574 + }, + { + "epoch": 1.1852328273732957, + "grad_norm": 0.2072172611951828, + "learning_rate": 9.04925544100802e-06, + "loss": 0.2545, + "step": 575 + }, + { + "epoch": 1.1872909698996654, + "grad_norm": 0.20537281036376953, + "learning_rate": 9.02634593356243e-06, + "loss": 0.2517, + "step": 576 + }, + { + "epoch": 1.1893491124260356, + "grad_norm": 0.21034401655197144, + "learning_rate": 9.003436426116839e-06, + "loss": 0.2506, + "step": 577 + }, + { + "epoch": 1.1914072549524055, + "grad_norm": 0.21373845636844635, + "learning_rate": 8.98052691867125e-06, + "loss": 0.2544, + "step": 578 + }, + { + "epoch": 1.1934653974787754, + "grad_norm": 0.22282572090625763, + "learning_rate": 8.95761741122566e-06, + "loss": 0.2607, + "step": 579 + }, + { + "epoch": 1.1955235400051454, + "grad_norm": 0.20421402156352997, + "learning_rate": 8.93470790378007e-06, + "loss": 0.2636, + "step": 580 + }, + { + "epoch": 1.1975816825315153, + "grad_norm": 0.2095903605222702, + "learning_rate": 8.91179839633448e-06, + "loss": 0.2627, + "step": 581 + }, + { + "epoch": 1.1996398250578852, + "grad_norm": 0.2215132862329483, + "learning_rate": 8.888888888888888e-06, + "loss": 0.2651, + "step": 582 + }, + { + "epoch": 1.2016979675842552, + "grad_norm": 0.22536343336105347, + "learning_rate": 8.865979381443299e-06, + "loss": 0.2548, + "step": 583 + }, + { + "epoch": 1.203756110110625, + "grad_norm": 0.19969668984413147, + "learning_rate": 8.84306987399771e-06, + "loss": 0.2646, + "step": 584 + }, + { + "epoch": 1.205814252636995, + "grad_norm": 0.225993350148201, + "learning_rate": 8.82016036655212e-06, + "loss": 0.2607, + "step": 585 + }, + { + "epoch": 1.207872395163365, + "grad_norm": 0.19197311997413635, + "learning_rate": 8.797250859106529e-06, + "loss": 0.2519, + "step": 586 + }, + { + "epoch": 1.209930537689735, + "grad_norm": 0.1974429190158844, + "learning_rate": 8.77434135166094e-06, + "loss": 0.2512, + "step": 587 + }, + { + "epoch": 1.211988680216105, + "grad_norm": 0.19816122949123383, + "learning_rate": 8.75143184421535e-06, + "loss": 0.2582, + "step": 588 + }, + { + "epoch": 1.214046822742475, + "grad_norm": 0.20259711146354675, + "learning_rate": 8.72852233676976e-06, + "loss": 0.2561, + "step": 589 + }, + { + "epoch": 1.216104965268845, + "grad_norm": 0.23857274651527405, + "learning_rate": 8.70561282932417e-06, + "loss": 0.2574, + "step": 590 + }, + { + "epoch": 1.2181631077952149, + "grad_norm": 0.2108597606420517, + "learning_rate": 8.68270332187858e-06, + "loss": 0.2546, + "step": 591 + }, + { + "epoch": 1.2202212503215848, + "grad_norm": 0.20933857560157776, + "learning_rate": 8.65979381443299e-06, + "loss": 0.2527, + "step": 592 + }, + { + "epoch": 1.2222793928479547, + "grad_norm": 0.19276075065135956, + "learning_rate": 8.636884306987401e-06, + "loss": 0.26, + "step": 593 + }, + { + "epoch": 1.2243375353743247, + "grad_norm": 0.2111658900976181, + "learning_rate": 8.61397479954181e-06, + "loss": 0.267, + "step": 594 + }, + { + "epoch": 1.2263956779006946, + "grad_norm": 0.20039953291416168, + "learning_rate": 8.591065292096221e-06, + "loss": 0.2454, + "step": 595 + }, + { + "epoch": 1.2284538204270645, + "grad_norm": 0.212934210896492, + "learning_rate": 8.56815578465063e-06, + "loss": 0.2674, + "step": 596 + }, + { + "epoch": 1.2305119629534345, + "grad_norm": 0.2036072462797165, + "learning_rate": 8.54524627720504e-06, + "loss": 0.2613, + "step": 597 + }, + { + "epoch": 1.2325701054798044, + "grad_norm": 0.20735019445419312, + "learning_rate": 8.522336769759451e-06, + "loss": 0.2648, + "step": 598 + }, + { + "epoch": 1.2346282480061745, + "grad_norm": 0.2097824215888977, + "learning_rate": 8.49942726231386e-06, + "loss": 0.2535, + "step": 599 + }, + { + "epoch": 1.2366863905325443, + "grad_norm": 0.19988034665584564, + "learning_rate": 8.47651775486827e-06, + "loss": 0.2507, + "step": 600 + }, + { + "epoch": 1.2366863905325443, + "eval_loss": 0.28046268224716187, + "eval_runtime": 2441.2385, + "eval_samples_per_second": 3.184, + "eval_steps_per_second": 0.796, + "step": 600 + }, + { + "epoch": 1.2387445330589144, + "grad_norm": 0.20321473479270935, + "learning_rate": 8.453608247422681e-06, + "loss": 0.2588, + "step": 601 + }, + { + "epoch": 1.2408026755852843, + "grad_norm": 0.20362116396427155, + "learning_rate": 8.430698739977092e-06, + "loss": 0.2608, + "step": 602 + }, + { + "epoch": 1.2428608181116543, + "grad_norm": 0.20123381912708282, + "learning_rate": 8.4077892325315e-06, + "loss": 0.2527, + "step": 603 + }, + { + "epoch": 1.2449189606380242, + "grad_norm": 0.2133895605802536, + "learning_rate": 8.384879725085911e-06, + "loss": 0.2731, + "step": 604 + }, + { + "epoch": 1.2469771031643941, + "grad_norm": 0.5265193581581116, + "learning_rate": 8.361970217640322e-06, + "loss": 0.2498, + "step": 605 + }, + { + "epoch": 1.249035245690764, + "grad_norm": 0.2142847776412964, + "learning_rate": 8.339060710194732e-06, + "loss": 0.268, + "step": 606 + }, + { + "epoch": 1.251093388217134, + "grad_norm": 0.19556185603141785, + "learning_rate": 8.316151202749141e-06, + "loss": 0.2587, + "step": 607 + }, + { + "epoch": 1.253151530743504, + "grad_norm": 0.20104384422302246, + "learning_rate": 8.293241695303552e-06, + "loss": 0.248, + "step": 608 + }, + { + "epoch": 1.2552096732698739, + "grad_norm": 0.20386339724063873, + "learning_rate": 8.27033218785796e-06, + "loss": 0.2564, + "step": 609 + }, + { + "epoch": 1.257267815796244, + "grad_norm": 0.21464361250400543, + "learning_rate": 8.247422680412371e-06, + "loss": 0.2651, + "step": 610 + }, + { + "epoch": 1.2593259583226137, + "grad_norm": 0.20295380055904388, + "learning_rate": 8.224513172966782e-06, + "loss": 0.249, + "step": 611 + }, + { + "epoch": 1.261384100848984, + "grad_norm": 0.19431617856025696, + "learning_rate": 8.201603665521193e-06, + "loss": 0.2487, + "step": 612 + }, + { + "epoch": 1.2634422433753538, + "grad_norm": 0.20218072831630707, + "learning_rate": 8.178694158075601e-06, + "loss": 0.2609, + "step": 613 + }, + { + "epoch": 1.2655003859017238, + "grad_norm": 0.20500090718269348, + "learning_rate": 8.155784650630012e-06, + "loss": 0.2705, + "step": 614 + }, + { + "epoch": 1.2675585284280937, + "grad_norm": 0.20803052186965942, + "learning_rate": 8.132875143184423e-06, + "loss": 0.2525, + "step": 615 + }, + { + "epoch": 1.2696166709544636, + "grad_norm": 0.2087874561548233, + "learning_rate": 8.109965635738832e-06, + "loss": 0.2541, + "step": 616 + }, + { + "epoch": 1.2716748134808336, + "grad_norm": 0.2055324912071228, + "learning_rate": 8.087056128293242e-06, + "loss": 0.2647, + "step": 617 + }, + { + "epoch": 1.2737329560072035, + "grad_norm": 0.20352068543434143, + "learning_rate": 8.064146620847653e-06, + "loss": 0.2666, + "step": 618 + }, + { + "epoch": 1.2757910985335734, + "grad_norm": 0.20651914179325104, + "learning_rate": 8.041237113402063e-06, + "loss": 0.2525, + "step": 619 + }, + { + "epoch": 1.2778492410599434, + "grad_norm": 0.2097817212343216, + "learning_rate": 8.018327605956472e-06, + "loss": 0.2576, + "step": 620 + }, + { + "epoch": 1.2799073835863133, + "grad_norm": 0.20695503056049347, + "learning_rate": 7.995418098510883e-06, + "loss": 0.2633, + "step": 621 + }, + { + "epoch": 1.2819655261126832, + "grad_norm": 0.20550110936164856, + "learning_rate": 7.972508591065293e-06, + "loss": 0.2629, + "step": 622 + }, + { + "epoch": 1.2840236686390534, + "grad_norm": 0.2035083919763565, + "learning_rate": 7.949599083619702e-06, + "loss": 0.2566, + "step": 623 + }, + { + "epoch": 1.286081811165423, + "grad_norm": 0.21426044404506683, + "learning_rate": 7.926689576174113e-06, + "loss": 0.2636, + "step": 624 + }, + { + "epoch": 1.2881399536917932, + "grad_norm": 0.20519520342350006, + "learning_rate": 7.903780068728523e-06, + "loss": 0.2665, + "step": 625 + }, + { + "epoch": 1.2901980962181632, + "grad_norm": 0.2012549638748169, + "learning_rate": 7.880870561282932e-06, + "loss": 0.2588, + "step": 626 + }, + { + "epoch": 1.292256238744533, + "grad_norm": 0.19951675832271576, + "learning_rate": 7.857961053837343e-06, + "loss": 0.2592, + "step": 627 + }, + { + "epoch": 1.294314381270903, + "grad_norm": 0.21163856983184814, + "learning_rate": 7.835051546391754e-06, + "loss": 0.26, + "step": 628 + }, + { + "epoch": 1.296372523797273, + "grad_norm": 0.21543577313423157, + "learning_rate": 7.812142038946164e-06, + "loss": 0.2486, + "step": 629 + }, + { + "epoch": 1.298430666323643, + "grad_norm": 0.20984649658203125, + "learning_rate": 7.789232531500573e-06, + "loss": 0.2603, + "step": 630 + }, + { + "epoch": 1.3004888088500128, + "grad_norm": 0.20047229528427124, + "learning_rate": 7.766323024054984e-06, + "loss": 0.2559, + "step": 631 + }, + { + "epoch": 1.3025469513763828, + "grad_norm": 0.21747010946273804, + "learning_rate": 7.743413516609394e-06, + "loss": 0.2563, + "step": 632 + }, + { + "epoch": 1.3046050939027527, + "grad_norm": 0.20818108320236206, + "learning_rate": 7.720504009163803e-06, + "loss": 0.2507, + "step": 633 + }, + { + "epoch": 1.3066632364291229, + "grad_norm": 0.19827309250831604, + "learning_rate": 7.697594501718214e-06, + "loss": 0.2578, + "step": 634 + }, + { + "epoch": 1.3087213789554926, + "grad_norm": 0.2122543305158615, + "learning_rate": 7.674684994272624e-06, + "loss": 0.2633, + "step": 635 + }, + { + "epoch": 1.3107795214818627, + "grad_norm": 0.20870576798915863, + "learning_rate": 7.651775486827033e-06, + "loss": 0.2616, + "step": 636 + }, + { + "epoch": 1.3128376640082327, + "grad_norm": 0.2069362998008728, + "learning_rate": 7.628865979381444e-06, + "loss": 0.2426, + "step": 637 + }, + { + "epoch": 1.3148958065346026, + "grad_norm": 0.19999894499778748, + "learning_rate": 7.6059564719358535e-06, + "loss": 0.2547, + "step": 638 + }, + { + "epoch": 1.3169539490609725, + "grad_norm": 0.20518334209918976, + "learning_rate": 7.583046964490264e-06, + "loss": 0.2571, + "step": 639 + }, + { + "epoch": 1.3190120915873425, + "grad_norm": 0.20558986067771912, + "learning_rate": 7.560137457044674e-06, + "loss": 0.2483, + "step": 640 + }, + { + "epoch": 1.3210702341137124, + "grad_norm": 0.21443884074687958, + "learning_rate": 7.5372279495990845e-06, + "loss": 0.2494, + "step": 641 + }, + { + "epoch": 1.3231283766400823, + "grad_norm": 0.2025483101606369, + "learning_rate": 7.514318442153494e-06, + "loss": 0.2473, + "step": 642 + }, + { + "epoch": 1.3251865191664522, + "grad_norm": 0.21094976365566254, + "learning_rate": 7.491408934707905e-06, + "loss": 0.2603, + "step": 643 + }, + { + "epoch": 1.3272446616928222, + "grad_norm": 0.2047881782054901, + "learning_rate": 7.4684994272623145e-06, + "loss": 0.2601, + "step": 644 + }, + { + "epoch": 1.3293028042191921, + "grad_norm": 0.2075866013765335, + "learning_rate": 7.445589919816725e-06, + "loss": 0.2644, + "step": 645 + }, + { + "epoch": 1.331360946745562, + "grad_norm": 0.2174414098262787, + "learning_rate": 7.422680412371135e-06, + "loss": 0.2609, + "step": 646 + }, + { + "epoch": 1.3334190892719322, + "grad_norm": 0.20820266008377075, + "learning_rate": 7.3997709049255455e-06, + "loss": 0.2535, + "step": 647 + }, + { + "epoch": 1.335477231798302, + "grad_norm": 0.20941515266895294, + "learning_rate": 7.376861397479955e-06, + "loss": 0.2578, + "step": 648 + }, + { + "epoch": 1.337535374324672, + "grad_norm": 0.2027975171804428, + "learning_rate": 7.353951890034365e-06, + "loss": 0.2573, + "step": 649 + }, + { + "epoch": 1.339593516851042, + "grad_norm": 0.209550142288208, + "learning_rate": 7.331042382588775e-06, + "loss": 0.2513, + "step": 650 + }, + { + "epoch": 1.341651659377412, + "grad_norm": 0.21425557136535645, + "learning_rate": 7.3081328751431845e-06, + "loss": 0.2568, + "step": 651 + }, + { + "epoch": 1.3437098019037819, + "grad_norm": 0.22760476171970367, + "learning_rate": 7.285223367697595e-06, + "loss": 0.2549, + "step": 652 + }, + { + "epoch": 1.3457679444301518, + "grad_norm": 0.21329441666603088, + "learning_rate": 7.262313860252005e-06, + "loss": 0.2467, + "step": 653 + }, + { + "epoch": 1.3478260869565217, + "grad_norm": 0.20949490368366241, + "learning_rate": 7.239404352806415e-06, + "loss": 0.2569, + "step": 654 + }, + { + "epoch": 1.3498842294828917, + "grad_norm": 0.21022753417491913, + "learning_rate": 7.216494845360825e-06, + "loss": 0.2644, + "step": 655 + }, + { + "epoch": 1.3519423720092616, + "grad_norm": 0.20240676403045654, + "learning_rate": 7.193585337915236e-06, + "loss": 0.2561, + "step": 656 + }, + { + "epoch": 1.3540005145356315, + "grad_norm": 0.19892892241477966, + "learning_rate": 7.1706758304696455e-06, + "loss": 0.2564, + "step": 657 + }, + { + "epoch": 1.3560586570620017, + "grad_norm": 0.22104541957378387, + "learning_rate": 7.147766323024056e-06, + "loss": 0.2466, + "step": 658 + }, + { + "epoch": 1.3581167995883714, + "grad_norm": 0.2074560970067978, + "learning_rate": 7.124856815578466e-06, + "loss": 0.2634, + "step": 659 + }, + { + "epoch": 1.3601749421147415, + "grad_norm": 0.20596396923065186, + "learning_rate": 7.101947308132876e-06, + "loss": 0.2566, + "step": 660 + }, + { + "epoch": 1.3622330846411115, + "grad_norm": 0.2072969526052475, + "learning_rate": 7.079037800687286e-06, + "loss": 0.2603, + "step": 661 + }, + { + "epoch": 1.3642912271674814, + "grad_norm": 0.21680790185928345, + "learning_rate": 7.056128293241697e-06, + "loss": 0.2536, + "step": 662 + }, + { + "epoch": 1.3663493696938513, + "grad_norm": 0.2035921961069107, + "learning_rate": 7.0332187857961065e-06, + "loss": 0.2567, + "step": 663 + }, + { + "epoch": 1.3684075122202213, + "grad_norm": 0.21186605095863342, + "learning_rate": 7.010309278350515e-06, + "loss": 0.2575, + "step": 664 + }, + { + "epoch": 1.3704656547465912, + "grad_norm": 0.21388404071331024, + "learning_rate": 6.987399770904926e-06, + "loss": 0.2522, + "step": 665 + }, + { + "epoch": 1.3725237972729611, + "grad_norm": 0.21118783950805664, + "learning_rate": 6.964490263459336e-06, + "loss": 0.25, + "step": 666 + }, + { + "epoch": 1.374581939799331, + "grad_norm": 0.21162322163581848, + "learning_rate": 6.941580756013746e-06, + "loss": 0.253, + "step": 667 + }, + { + "epoch": 1.376640082325701, + "grad_norm": 0.21186329424381256, + "learning_rate": 6.918671248568156e-06, + "loss": 0.2589, + "step": 668 + }, + { + "epoch": 1.378698224852071, + "grad_norm": 0.21206888556480408, + "learning_rate": 6.895761741122567e-06, + "loss": 0.2629, + "step": 669 + }, + { + "epoch": 1.3807563673784409, + "grad_norm": 0.21045179665088654, + "learning_rate": 6.872852233676976e-06, + "loss": 0.2523, + "step": 670 + }, + { + "epoch": 1.382814509904811, + "grad_norm": 0.21106329560279846, + "learning_rate": 6.849942726231387e-06, + "loss": 0.2611, + "step": 671 + }, + { + "epoch": 1.3848726524311807, + "grad_norm": 0.20593757927417755, + "learning_rate": 6.827033218785797e-06, + "loss": 0.2537, + "step": 672 + }, + { + "epoch": 1.386930794957551, + "grad_norm": 0.2040368914604187, + "learning_rate": 6.804123711340207e-06, + "loss": 0.2545, + "step": 673 + }, + { + "epoch": 1.3889889374839208, + "grad_norm": 0.2148980051279068, + "learning_rate": 6.781214203894617e-06, + "loss": 0.264, + "step": 674 + }, + { + "epoch": 1.3910470800102908, + "grad_norm": 0.204456627368927, + "learning_rate": 6.758304696449028e-06, + "loss": 0.2609, + "step": 675 + }, + { + "epoch": 1.3931052225366607, + "grad_norm": 0.20230846107006073, + "learning_rate": 6.735395189003437e-06, + "loss": 0.2644, + "step": 676 + }, + { + "epoch": 1.3951633650630306, + "grad_norm": 0.205158531665802, + "learning_rate": 6.712485681557846e-06, + "loss": 0.2611, + "step": 677 + }, + { + "epoch": 1.3972215075894006, + "grad_norm": 0.21487553417682648, + "learning_rate": 6.689576174112257e-06, + "loss": 0.2492, + "step": 678 + }, + { + "epoch": 1.3992796501157705, + "grad_norm": 0.21277402341365814, + "learning_rate": 6.666666666666667e-06, + "loss": 0.2491, + "step": 679 + }, + { + "epoch": 1.4013377926421404, + "grad_norm": 0.2049219310283661, + "learning_rate": 6.643757159221077e-06, + "loss": 0.2444, + "step": 680 + }, + { + "epoch": 1.4033959351685104, + "grad_norm": 0.23122920095920563, + "learning_rate": 6.620847651775487e-06, + "loss": 0.2522, + "step": 681 + }, + { + "epoch": 1.4054540776948803, + "grad_norm": 0.2067662477493286, + "learning_rate": 6.597938144329898e-06, + "loss": 0.2583, + "step": 682 + }, + { + "epoch": 1.4075122202212502, + "grad_norm": 0.2043958306312561, + "learning_rate": 6.575028636884307e-06, + "loss": 0.2603, + "step": 683 + }, + { + "epoch": 1.4095703627476204, + "grad_norm": 0.21982067823410034, + "learning_rate": 6.552119129438718e-06, + "loss": 0.246, + "step": 684 + }, + { + "epoch": 1.41162850527399, + "grad_norm": 0.21510522067546844, + "learning_rate": 6.529209621993128e-06, + "loss": 0.2554, + "step": 685 + }, + { + "epoch": 1.4136866478003602, + "grad_norm": 0.24448052048683167, + "learning_rate": 6.506300114547538e-06, + "loss": 0.256, + "step": 686 + }, + { + "epoch": 1.4157447903267302, + "grad_norm": 0.2068399339914322, + "learning_rate": 6.483390607101948e-06, + "loss": 0.2566, + "step": 687 + }, + { + "epoch": 1.4178029328531, + "grad_norm": 0.20870736241340637, + "learning_rate": 6.460481099656359e-06, + "loss": 0.2493, + "step": 688 + }, + { + "epoch": 1.41986107537947, + "grad_norm": 0.22065278887748718, + "learning_rate": 6.437571592210768e-06, + "loss": 0.2566, + "step": 689 + }, + { + "epoch": 1.42191921790584, + "grad_norm": 0.21523869037628174, + "learning_rate": 6.414662084765179e-06, + "loss": 0.2579, + "step": 690 + }, + { + "epoch": 1.42397736043221, + "grad_norm": 0.21578392386436462, + "learning_rate": 6.391752577319588e-06, + "loss": 0.2555, + "step": 691 + }, + { + "epoch": 1.4260355029585798, + "grad_norm": 0.2096480280160904, + "learning_rate": 6.3688430698739976e-06, + "loss": 0.2534, + "step": 692 + }, + { + "epoch": 1.4280936454849498, + "grad_norm": 0.21274186670780182, + "learning_rate": 6.345933562428408e-06, + "loss": 0.2521, + "step": 693 + }, + { + "epoch": 1.4301517880113197, + "grad_norm": 0.21426336467266083, + "learning_rate": 6.323024054982818e-06, + "loss": 0.2589, + "step": 694 + }, + { + "epoch": 1.4322099305376899, + "grad_norm": 0.21294309198856354, + "learning_rate": 6.3001145475372285e-06, + "loss": 0.2615, + "step": 695 + }, + { + "epoch": 1.4342680730640596, + "grad_norm": 0.2021908164024353, + "learning_rate": 6.277205040091638e-06, + "loss": 0.2714, + "step": 696 + }, + { + "epoch": 1.4363262155904297, + "grad_norm": 0.21605439484119415, + "learning_rate": 6.254295532646049e-06, + "loss": 0.2592, + "step": 697 + }, + { + "epoch": 1.4383843581167997, + "grad_norm": 0.2154022753238678, + "learning_rate": 6.231386025200459e-06, + "loss": 0.2633, + "step": 698 + }, + { + "epoch": 1.4404425006431696, + "grad_norm": 0.2178344875574112, + "learning_rate": 6.208476517754869e-06, + "loss": 0.2685, + "step": 699 + }, + { + "epoch": 1.4425006431695395, + "grad_norm": 0.21423941850662231, + "learning_rate": 6.185567010309279e-06, + "loss": 0.2474, + "step": 700 + }, + { + "epoch": 1.4425006431695395, + "eval_loss": 0.27773216366767883, + "eval_runtime": 2423.2314, + "eval_samples_per_second": 3.208, + "eval_steps_per_second": 0.802, + "step": 700 + }, + { + "epoch": 1.4445587856959095, + "grad_norm": 0.19836685061454773, + "learning_rate": 6.1626575028636895e-06, + "loss": 0.2556, + "step": 701 + }, + { + "epoch": 1.4466169282222794, + "grad_norm": 0.21015697717666626, + "learning_rate": 6.139747995418099e-06, + "loss": 0.2605, + "step": 702 + }, + { + "epoch": 1.4486750707486493, + "grad_norm": 0.2158636897802353, + "learning_rate": 6.11683848797251e-06, + "loss": 0.252, + "step": 703 + }, + { + "epoch": 1.4507332132750193, + "grad_norm": 0.2136162966489792, + "learning_rate": 6.09392898052692e-06, + "loss": 0.2451, + "step": 704 + }, + { + "epoch": 1.4527913558013892, + "grad_norm": 0.21352505683898926, + "learning_rate": 6.0710194730813285e-06, + "loss": 0.2649, + "step": 705 + }, + { + "epoch": 1.4548494983277591, + "grad_norm": 0.22503146529197693, + "learning_rate": 6.048109965635739e-06, + "loss": 0.2604, + "step": 706 + }, + { + "epoch": 1.456907640854129, + "grad_norm": 0.2114841490983963, + "learning_rate": 6.025200458190149e-06, + "loss": 0.2547, + "step": 707 + }, + { + "epoch": 1.4589657833804992, + "grad_norm": 0.22603987157344818, + "learning_rate": 6.0022909507445594e-06, + "loss": 0.2551, + "step": 708 + }, + { + "epoch": 1.461023925906869, + "grad_norm": 0.2188458889722824, + "learning_rate": 5.979381443298969e-06, + "loss": 0.2553, + "step": 709 + }, + { + "epoch": 1.463082068433239, + "grad_norm": 0.21128129959106445, + "learning_rate": 5.95647193585338e-06, + "loss": 0.258, + "step": 710 + }, + { + "epoch": 1.465140210959609, + "grad_norm": 0.22289037704467773, + "learning_rate": 5.9335624284077895e-06, + "loss": 0.2709, + "step": 711 + }, + { + "epoch": 1.467198353485979, + "grad_norm": 0.21750517189502716, + "learning_rate": 5.9106529209622e-06, + "loss": 0.2597, + "step": 712 + }, + { + "epoch": 1.4692564960123489, + "grad_norm": 0.21022778749465942, + "learning_rate": 5.88774341351661e-06, + "loss": 0.2533, + "step": 713 + }, + { + "epoch": 1.4713146385387188, + "grad_norm": 0.21544480323791504, + "learning_rate": 5.8648339060710204e-06, + "loss": 0.255, + "step": 714 + }, + { + "epoch": 1.4733727810650887, + "grad_norm": 0.20856665074825287, + "learning_rate": 5.84192439862543e-06, + "loss": 0.2648, + "step": 715 + }, + { + "epoch": 1.4754309235914587, + "grad_norm": 0.2105010449886322, + "learning_rate": 5.819014891179841e-06, + "loss": 0.2611, + "step": 716 + }, + { + "epoch": 1.4774890661178286, + "grad_norm": 0.21749204397201538, + "learning_rate": 5.7961053837342505e-06, + "loss": 0.2551, + "step": 717 + }, + { + "epoch": 1.4795472086441985, + "grad_norm": 0.20478859543800354, + "learning_rate": 5.7731958762886594e-06, + "loss": 0.2554, + "step": 718 + }, + { + "epoch": 1.4816053511705687, + "grad_norm": 0.213475301861763, + "learning_rate": 5.75028636884307e-06, + "loss": 0.2622, + "step": 719 + }, + { + "epoch": 1.4836634936969384, + "grad_norm": 0.2008693963289261, + "learning_rate": 5.72737686139748e-06, + "loss": 0.2483, + "step": 720 + }, + { + "epoch": 1.4857216362233086, + "grad_norm": 0.19621135294437408, + "learning_rate": 5.70446735395189e-06, + "loss": 0.2553, + "step": 721 + }, + { + "epoch": 1.4877797787496785, + "grad_norm": 0.227009579539299, + "learning_rate": 5.6815578465063e-06, + "loss": 0.2529, + "step": 722 + }, + { + "epoch": 1.4898379212760484, + "grad_norm": 0.21584804356098175, + "learning_rate": 5.658648339060711e-06, + "loss": 0.2545, + "step": 723 + }, + { + "epoch": 1.4918960638024183, + "grad_norm": 0.2207970768213272, + "learning_rate": 5.6357388316151204e-06, + "loss": 0.2463, + "step": 724 + }, + { + "epoch": 1.4939542063287883, + "grad_norm": 0.22498710453510284, + "learning_rate": 5.612829324169531e-06, + "loss": 0.2593, + "step": 725 + }, + { + "epoch": 1.4960123488551582, + "grad_norm": 0.2146955132484436, + "learning_rate": 5.589919816723941e-06, + "loss": 0.2466, + "step": 726 + }, + { + "epoch": 1.4980704913815281, + "grad_norm": 0.21701963245868683, + "learning_rate": 5.567010309278351e-06, + "loss": 0.2602, + "step": 727 + }, + { + "epoch": 1.500128633907898, + "grad_norm": 0.2154153287410736, + "learning_rate": 5.544100801832761e-06, + "loss": 0.2652, + "step": 728 + }, + { + "epoch": 1.502186776434268, + "grad_norm": 0.2135971337556839, + "learning_rate": 5.521191294387172e-06, + "loss": 0.2465, + "step": 729 + }, + { + "epoch": 1.5042449189606382, + "grad_norm": 0.21887153387069702, + "learning_rate": 5.4982817869415815e-06, + "loss": 0.2553, + "step": 730 + }, + { + "epoch": 1.5063030614870079, + "grad_norm": 0.21986471116542816, + "learning_rate": 5.475372279495992e-06, + "loss": 0.2568, + "step": 731 + }, + { + "epoch": 1.508361204013378, + "grad_norm": 0.2224634885787964, + "learning_rate": 5.452462772050401e-06, + "loss": 0.2609, + "step": 732 + }, + { + "epoch": 1.5104193465397477, + "grad_norm": 0.22347122430801392, + "learning_rate": 5.429553264604811e-06, + "loss": 0.2557, + "step": 733 + }, + { + "epoch": 1.512477489066118, + "grad_norm": 0.21803030371665955, + "learning_rate": 5.406643757159221e-06, + "loss": 0.2622, + "step": 734 + }, + { + "epoch": 1.5145356315924878, + "grad_norm": 0.2078487128019333, + "learning_rate": 5.383734249713631e-06, + "loss": 0.249, + "step": 735 + }, + { + "epoch": 1.5165937741188578, + "grad_norm": 0.20815445482730865, + "learning_rate": 5.360824742268042e-06, + "loss": 0.2538, + "step": 736 + }, + { + "epoch": 1.5186519166452277, + "grad_norm": 0.21298891305923462, + "learning_rate": 5.337915234822451e-06, + "loss": 0.2532, + "step": 737 + }, + { + "epoch": 1.5207100591715976, + "grad_norm": 0.21032264828681946, + "learning_rate": 5.315005727376862e-06, + "loss": 0.2428, + "step": 738 + }, + { + "epoch": 1.5227682016979676, + "grad_norm": 0.23191553354263306, + "learning_rate": 5.292096219931272e-06, + "loss": 0.2545, + "step": 739 + }, + { + "epoch": 1.5248263442243375, + "grad_norm": 0.21168164908885956, + "learning_rate": 5.269186712485682e-06, + "loss": 0.2668, + "step": 740 + }, + { + "epoch": 1.5268844867507076, + "grad_norm": 0.2142658829689026, + "learning_rate": 5.246277205040092e-06, + "loss": 0.2589, + "step": 741 + }, + { + "epoch": 1.5289426292770774, + "grad_norm": 0.2130551040172577, + "learning_rate": 5.223367697594503e-06, + "loss": 0.248, + "step": 742 + }, + { + "epoch": 1.5310007718034475, + "grad_norm": 0.2171664535999298, + "learning_rate": 5.200458190148912e-06, + "loss": 0.2539, + "step": 743 + }, + { + "epoch": 1.5330589143298172, + "grad_norm": 0.21375024318695068, + "learning_rate": 5.177548682703323e-06, + "loss": 0.2471, + "step": 744 + }, + { + "epoch": 1.5351170568561874, + "grad_norm": 0.21037080883979797, + "learning_rate": 5.154639175257732e-06, + "loss": 0.2526, + "step": 745 + }, + { + "epoch": 1.537175199382557, + "grad_norm": 0.2103818953037262, + "learning_rate": 5.131729667812142e-06, + "loss": 0.2609, + "step": 746 + }, + { + "epoch": 1.5392333419089272, + "grad_norm": 0.21307708323001862, + "learning_rate": 5.108820160366552e-06, + "loss": 0.2606, + "step": 747 + }, + { + "epoch": 1.5412914844352972, + "grad_norm": 0.2052801549434662, + "learning_rate": 5.085910652920962e-06, + "loss": 0.2462, + "step": 748 + }, + { + "epoch": 1.543349626961667, + "grad_norm": 0.2059316784143448, + "learning_rate": 5.0630011454753726e-06, + "loss": 0.2593, + "step": 749 + }, + { + "epoch": 1.545407769488037, + "grad_norm": 0.211748406291008, + "learning_rate": 5.040091638029782e-06, + "loss": 0.2582, + "step": 750 + }, + { + "epoch": 1.547465912014407, + "grad_norm": 0.20883141458034515, + "learning_rate": 5.017182130584193e-06, + "loss": 0.251, + "step": 751 + }, + { + "epoch": 1.549524054540777, + "grad_norm": 0.21496839821338654, + "learning_rate": 4.994272623138603e-06, + "loss": 0.2486, + "step": 752 + }, + { + "epoch": 1.5515821970671468, + "grad_norm": 0.21443761885166168, + "learning_rate": 4.971363115693013e-06, + "loss": 0.2541, + "step": 753 + }, + { + "epoch": 1.553640339593517, + "grad_norm": 0.2164083868265152, + "learning_rate": 4.948453608247423e-06, + "loss": 0.2515, + "step": 754 + }, + { + "epoch": 1.5556984821198867, + "grad_norm": 0.22733120620250702, + "learning_rate": 4.9255441008018336e-06, + "loss": 0.2674, + "step": 755 + }, + { + "epoch": 1.5577566246462569, + "grad_norm": 0.21141202747821808, + "learning_rate": 4.902634593356243e-06, + "loss": 0.2586, + "step": 756 + }, + { + "epoch": 1.5598147671726266, + "grad_norm": 0.20612719655036926, + "learning_rate": 4.879725085910653e-06, + "loss": 0.2417, + "step": 757 + }, + { + "epoch": 1.5618729096989967, + "grad_norm": 0.21028929948806763, + "learning_rate": 4.856815578465064e-06, + "loss": 0.2546, + "step": 758 + }, + { + "epoch": 1.5639310522253667, + "grad_norm": 0.2196635901927948, + "learning_rate": 4.833906071019473e-06, + "loss": 0.2527, + "step": 759 + }, + { + "epoch": 1.5659891947517366, + "grad_norm": 0.20016127824783325, + "learning_rate": 4.810996563573884e-06, + "loss": 0.2629, + "step": 760 + }, + { + "epoch": 1.5680473372781065, + "grad_norm": 0.20597878098487854, + "learning_rate": 4.788087056128294e-06, + "loss": 0.2544, + "step": 761 + }, + { + "epoch": 1.5701054798044765, + "grad_norm": 0.20151163637638092, + "learning_rate": 4.7651775486827035e-06, + "loss": 0.2569, + "step": 762 + }, + { + "epoch": 1.5721636223308464, + "grad_norm": 0.21117815375328064, + "learning_rate": 4.742268041237113e-06, + "loss": 0.2602, + "step": 763 + }, + { + "epoch": 1.5742217648572163, + "grad_norm": 0.20184555649757385, + "learning_rate": 4.719358533791524e-06, + "loss": 0.2673, + "step": 764 + }, + { + "epoch": 1.5762799073835865, + "grad_norm": 0.20125100016593933, + "learning_rate": 4.6964490263459336e-06, + "loss": 0.2669, + "step": 765 + }, + { + "epoch": 1.5783380499099562, + "grad_norm": 0.2209872603416443, + "learning_rate": 4.673539518900344e-06, + "loss": 0.2569, + "step": 766 + }, + { + "epoch": 1.5803961924363263, + "grad_norm": 0.21065855026245117, + "learning_rate": 4.650630011454754e-06, + "loss": 0.2477, + "step": 767 + }, + { + "epoch": 1.582454334962696, + "grad_norm": 0.20995444059371948, + "learning_rate": 4.6277205040091645e-06, + "loss": 0.2563, + "step": 768 + }, + { + "epoch": 1.5845124774890662, + "grad_norm": 0.21762295067310333, + "learning_rate": 4.604810996563574e-06, + "loss": 0.2443, + "step": 769 + }, + { + "epoch": 1.586570620015436, + "grad_norm": 0.21741704642772675, + "learning_rate": 4.581901489117984e-06, + "loss": 0.2442, + "step": 770 + }, + { + "epoch": 1.588628762541806, + "grad_norm": 0.21586772799491882, + "learning_rate": 4.5589919816723946e-06, + "loss": 0.2484, + "step": 771 + }, + { + "epoch": 1.590686905068176, + "grad_norm": 0.22184152901172638, + "learning_rate": 4.536082474226804e-06, + "loss": 0.2522, + "step": 772 + }, + { + "epoch": 1.592745047594546, + "grad_norm": 0.22210553288459778, + "learning_rate": 4.513172966781215e-06, + "loss": 0.2552, + "step": 773 + }, + { + "epoch": 1.5948031901209159, + "grad_norm": 0.2075122743844986, + "learning_rate": 4.490263459335625e-06, + "loss": 0.263, + "step": 774 + }, + { + "epoch": 1.5968613326472858, + "grad_norm": 0.20110896229743958, + "learning_rate": 4.467353951890035e-06, + "loss": 0.248, + "step": 775 + }, + { + "epoch": 1.5989194751736557, + "grad_norm": 0.2067912071943283, + "learning_rate": 4.444444444444444e-06, + "loss": 0.2485, + "step": 776 + }, + { + "epoch": 1.6009776177000257, + "grad_norm": 0.2091452181339264, + "learning_rate": 4.421534936998855e-06, + "loss": 0.2783, + "step": 777 + }, + { + "epoch": 1.6030357602263958, + "grad_norm": 0.21414563059806824, + "learning_rate": 4.3986254295532645e-06, + "loss": 0.2502, + "step": 778 + }, + { + "epoch": 1.6050939027527655, + "grad_norm": 0.21657651662826538, + "learning_rate": 4.375715922107675e-06, + "loss": 0.2589, + "step": 779 + }, + { + "epoch": 1.6071520452791357, + "grad_norm": 0.21607093513011932, + "learning_rate": 4.352806414662085e-06, + "loss": 0.2618, + "step": 780 + }, + { + "epoch": 1.6092101878055054, + "grad_norm": 0.21846850216388702, + "learning_rate": 4.329896907216495e-06, + "loss": 0.2549, + "step": 781 + }, + { + "epoch": 1.6112683303318756, + "grad_norm": 0.21873261034488678, + "learning_rate": 4.306987399770905e-06, + "loss": 0.2448, + "step": 782 + }, + { + "epoch": 1.6133264728582455, + "grad_norm": 0.22608645260334015, + "learning_rate": 4.284077892325315e-06, + "loss": 0.2559, + "step": 783 + }, + { + "epoch": 1.6153846153846154, + "grad_norm": 0.2121078372001648, + "learning_rate": 4.2611683848797255e-06, + "loss": 0.2515, + "step": 784 + }, + { + "epoch": 1.6174427579109854, + "grad_norm": 0.227590411901474, + "learning_rate": 4.238258877434135e-06, + "loss": 0.2549, + "step": 785 + }, + { + "epoch": 1.6195009004373553, + "grad_norm": 0.201515793800354, + "learning_rate": 4.215349369988546e-06, + "loss": 0.2601, + "step": 786 + }, + { + "epoch": 1.6215590429637252, + "grad_norm": 0.21896880865097046, + "learning_rate": 4.192439862542956e-06, + "loss": 0.2564, + "step": 787 + }, + { + "epoch": 1.6236171854900951, + "grad_norm": 0.21509996056556702, + "learning_rate": 4.169530355097366e-06, + "loss": 0.2491, + "step": 788 + }, + { + "epoch": 1.6256753280164653, + "grad_norm": 0.22020220756530762, + "learning_rate": 4.146620847651776e-06, + "loss": 0.2617, + "step": 789 + }, + { + "epoch": 1.627733470542835, + "grad_norm": 0.21420395374298096, + "learning_rate": 4.123711340206186e-06, + "loss": 0.2561, + "step": 790 + }, + { + "epoch": 1.6297916130692052, + "grad_norm": 0.2270808070898056, + "learning_rate": 4.100801832760596e-06, + "loss": 0.2621, + "step": 791 + }, + { + "epoch": 1.6318497555955749, + "grad_norm": 0.2320822924375534, + "learning_rate": 4.077892325315006e-06, + "loss": 0.269, + "step": 792 + }, + { + "epoch": 1.633907898121945, + "grad_norm": 0.21081334352493286, + "learning_rate": 4.054982817869416e-06, + "loss": 0.2468, + "step": 793 + }, + { + "epoch": 1.6359660406483147, + "grad_norm": 0.2204331010580063, + "learning_rate": 4.032073310423826e-06, + "loss": 0.2578, + "step": 794 + }, + { + "epoch": 1.638024183174685, + "grad_norm": 0.20907023549079895, + "learning_rate": 4.009163802978236e-06, + "loss": 0.2523, + "step": 795 + }, + { + "epoch": 1.6400823257010548, + "grad_norm": 0.23108816146850586, + "learning_rate": 3.986254295532647e-06, + "loss": 0.2619, + "step": 796 + }, + { + "epoch": 1.6421404682274248, + "grad_norm": 0.20781853795051575, + "learning_rate": 3.9633447880870564e-06, + "loss": 0.2496, + "step": 797 + }, + { + "epoch": 1.6441986107537947, + "grad_norm": 0.20635871589183807, + "learning_rate": 3.940435280641466e-06, + "loss": 0.2527, + "step": 798 + }, + { + "epoch": 1.6462567532801646, + "grad_norm": 0.21636071801185608, + "learning_rate": 3.917525773195877e-06, + "loss": 0.2585, + "step": 799 + }, + { + "epoch": 1.6483148958065346, + "grad_norm": 0.20485584437847137, + "learning_rate": 3.8946162657502865e-06, + "loss": 0.259, + "step": 800 + }, + { + "epoch": 1.6483148958065346, + "eval_loss": 0.27598318457603455, + "eval_runtime": 2425.8429, + "eval_samples_per_second": 3.205, + "eval_steps_per_second": 0.801, + "step": 800 + }, + { + "epoch": 1.6503730383329045, + "grad_norm": 0.2005116492509842, + "learning_rate": 3.871706758304697e-06, + "loss": 0.249, + "step": 801 + }, + { + "epoch": 1.6524311808592747, + "grad_norm": 0.20868004858493805, + "learning_rate": 3.848797250859107e-06, + "loss": 0.2502, + "step": 802 + }, + { + "epoch": 1.6544893233856444, + "grad_norm": 0.2084902971982956, + "learning_rate": 3.825887743413517e-06, + "loss": 0.2504, + "step": 803 + }, + { + "epoch": 1.6565474659120145, + "grad_norm": 0.2042844593524933, + "learning_rate": 3.8029782359679268e-06, + "loss": 0.2517, + "step": 804 + }, + { + "epoch": 1.6586056084383842, + "grad_norm": 0.2120312601327896, + "learning_rate": 3.780068728522337e-06, + "loss": 0.2466, + "step": 805 + }, + { + "epoch": 1.6606637509647544, + "grad_norm": 0.21600568294525146, + "learning_rate": 3.757159221076747e-06, + "loss": 0.2521, + "step": 806 + }, + { + "epoch": 1.6627218934911243, + "grad_norm": 0.22209151089191437, + "learning_rate": 3.7342497136311573e-06, + "loss": 0.2662, + "step": 807 + }, + { + "epoch": 1.6647800360174942, + "grad_norm": 0.22267431020736694, + "learning_rate": 3.7113402061855674e-06, + "loss": 0.2498, + "step": 808 + }, + { + "epoch": 1.6668381785438642, + "grad_norm": 0.21584516763687134, + "learning_rate": 3.6884306987399776e-06, + "loss": 0.2528, + "step": 809 + }, + { + "epoch": 1.6688963210702341, + "grad_norm": 0.20465044677257538, + "learning_rate": 3.6655211912943874e-06, + "loss": 0.2569, + "step": 810 + }, + { + "epoch": 1.670954463596604, + "grad_norm": 0.21515893936157227, + "learning_rate": 3.6426116838487975e-06, + "loss": 0.2466, + "step": 811 + }, + { + "epoch": 1.673012606122974, + "grad_norm": 0.22349058091640472, + "learning_rate": 3.6197021764032077e-06, + "loss": 0.2438, + "step": 812 + }, + { + "epoch": 1.6750707486493441, + "grad_norm": 0.23108039796352386, + "learning_rate": 3.596792668957618e-06, + "loss": 0.2461, + "step": 813 + }, + { + "epoch": 1.6771288911757138, + "grad_norm": 0.22195585072040558, + "learning_rate": 3.573883161512028e-06, + "loss": 0.2496, + "step": 814 + }, + { + "epoch": 1.679187033702084, + "grad_norm": 0.22752366960048676, + "learning_rate": 3.550973654066438e-06, + "loss": 0.2495, + "step": 815 + }, + { + "epoch": 1.6812451762284537, + "grad_norm": 0.21112024784088135, + "learning_rate": 3.5280641466208484e-06, + "loss": 0.2453, + "step": 816 + }, + { + "epoch": 1.6833033187548239, + "grad_norm": 0.21209532022476196, + "learning_rate": 3.5051546391752577e-06, + "loss": 0.2466, + "step": 817 + }, + { + "epoch": 1.6853614612811936, + "grad_norm": 0.21724505722522736, + "learning_rate": 3.482245131729668e-06, + "loss": 0.2449, + "step": 818 + }, + { + "epoch": 1.6874196038075637, + "grad_norm": 0.22240252792835236, + "learning_rate": 3.459335624284078e-06, + "loss": 0.2641, + "step": 819 + }, + { + "epoch": 1.6894777463339337, + "grad_norm": 0.217677503824234, + "learning_rate": 3.436426116838488e-06, + "loss": 0.2515, + "step": 820 + }, + { + "epoch": 1.6915358888603036, + "grad_norm": 0.2246546596288681, + "learning_rate": 3.4135166093928984e-06, + "loss": 0.2515, + "step": 821 + }, + { + "epoch": 1.6935940313866735, + "grad_norm": 0.20842307806015015, + "learning_rate": 3.3906071019473085e-06, + "loss": 0.2584, + "step": 822 + }, + { + "epoch": 1.6956521739130435, + "grad_norm": 0.21404647827148438, + "learning_rate": 3.3676975945017187e-06, + "loss": 0.2465, + "step": 823 + }, + { + "epoch": 1.6977103164394134, + "grad_norm": 0.21396222710609436, + "learning_rate": 3.3447880870561285e-06, + "loss": 0.2524, + "step": 824 + }, + { + "epoch": 1.6997684589657833, + "grad_norm": 0.21428625285625458, + "learning_rate": 3.3218785796105386e-06, + "loss": 0.2554, + "step": 825 + }, + { + "epoch": 1.7018266014921535, + "grad_norm": 0.21156470477581024, + "learning_rate": 3.298969072164949e-06, + "loss": 0.2427, + "step": 826 + }, + { + "epoch": 1.7038847440185232, + "grad_norm": 0.21247607469558716, + "learning_rate": 3.276059564719359e-06, + "loss": 0.2398, + "step": 827 + }, + { + "epoch": 1.7059428865448933, + "grad_norm": 0.2125396430492401, + "learning_rate": 3.253150057273769e-06, + "loss": 0.263, + "step": 828 + }, + { + "epoch": 1.708001029071263, + "grad_norm": 0.2217744141817093, + "learning_rate": 3.2302405498281793e-06, + "loss": 0.2447, + "step": 829 + }, + { + "epoch": 1.7100591715976332, + "grad_norm": 0.21454688906669617, + "learning_rate": 3.2073310423825895e-06, + "loss": 0.251, + "step": 830 + }, + { + "epoch": 1.7121173141240031, + "grad_norm": 0.20734067261219025, + "learning_rate": 3.1844215349369988e-06, + "loss": 0.2463, + "step": 831 + }, + { + "epoch": 1.714175456650373, + "grad_norm": 0.21725836396217346, + "learning_rate": 3.161512027491409e-06, + "loss": 0.2519, + "step": 832 + }, + { + "epoch": 1.716233599176743, + "grad_norm": 0.21743294596672058, + "learning_rate": 3.138602520045819e-06, + "loss": 0.2525, + "step": 833 + }, + { + "epoch": 1.718291741703113, + "grad_norm": 0.204753115773201, + "learning_rate": 3.1156930126002293e-06, + "loss": 0.2608, + "step": 834 + }, + { + "epoch": 1.7203498842294829, + "grad_norm": 0.2242167741060257, + "learning_rate": 3.0927835051546395e-06, + "loss": 0.2567, + "step": 835 + }, + { + "epoch": 1.7224080267558528, + "grad_norm": 0.21592582762241364, + "learning_rate": 3.0698739977090496e-06, + "loss": 0.2518, + "step": 836 + }, + { + "epoch": 1.724466169282223, + "grad_norm": 0.2153058797121048, + "learning_rate": 3.04696449026346e-06, + "loss": 0.2517, + "step": 837 + }, + { + "epoch": 1.7265243118085927, + "grad_norm": 0.20874065160751343, + "learning_rate": 3.0240549828178695e-06, + "loss": 0.2567, + "step": 838 + }, + { + "epoch": 1.7285824543349628, + "grad_norm": 0.22043587267398834, + "learning_rate": 3.0011454753722797e-06, + "loss": 0.2587, + "step": 839 + }, + { + "epoch": 1.7306405968613325, + "grad_norm": 0.20169200003147125, + "learning_rate": 2.97823596792669e-06, + "loss": 0.2629, + "step": 840 + }, + { + "epoch": 1.7326987393877027, + "grad_norm": 0.21708932518959045, + "learning_rate": 2.9553264604811e-06, + "loss": 0.2604, + "step": 841 + }, + { + "epoch": 1.7347568819140724, + "grad_norm": 0.21232086420059204, + "learning_rate": 2.9324169530355102e-06, + "loss": 0.2498, + "step": 842 + }, + { + "epoch": 1.7368150244404426, + "grad_norm": 0.20930655300617218, + "learning_rate": 2.9095074455899204e-06, + "loss": 0.2584, + "step": 843 + }, + { + "epoch": 1.7388731669668125, + "grad_norm": 0.207666277885437, + "learning_rate": 2.8865979381443297e-06, + "loss": 0.2414, + "step": 844 + }, + { + "epoch": 1.7409313094931824, + "grad_norm": 0.20519839227199554, + "learning_rate": 2.86368843069874e-06, + "loss": 0.2524, + "step": 845 + }, + { + "epoch": 1.7429894520195524, + "grad_norm": 0.22689610719680786, + "learning_rate": 2.84077892325315e-06, + "loss": 0.2502, + "step": 846 + }, + { + "epoch": 1.7450475945459223, + "grad_norm": 0.22423967719078064, + "learning_rate": 2.8178694158075602e-06, + "loss": 0.2432, + "step": 847 + }, + { + "epoch": 1.7471057370722922, + "grad_norm": 0.21444083750247955, + "learning_rate": 2.7949599083619704e-06, + "loss": 0.2501, + "step": 848 + }, + { + "epoch": 1.7491638795986622, + "grad_norm": 0.20746010541915894, + "learning_rate": 2.7720504009163806e-06, + "loss": 0.2585, + "step": 849 + }, + { + "epoch": 1.7512220221250323, + "grad_norm": 0.22796258330345154, + "learning_rate": 2.7491408934707907e-06, + "loss": 0.2482, + "step": 850 + }, + { + "epoch": 1.753280164651402, + "grad_norm": 0.21120622754096985, + "learning_rate": 2.7262313860252005e-06, + "loss": 0.2617, + "step": 851 + }, + { + "epoch": 1.7553383071777722, + "grad_norm": 0.21528108417987823, + "learning_rate": 2.7033218785796106e-06, + "loss": 0.2564, + "step": 852 + }, + { + "epoch": 1.7573964497041419, + "grad_norm": 0.2123376727104187, + "learning_rate": 2.680412371134021e-06, + "loss": 0.2505, + "step": 853 + }, + { + "epoch": 1.759454592230512, + "grad_norm": 0.2255619317293167, + "learning_rate": 2.657502863688431e-06, + "loss": 0.2598, + "step": 854 + }, + { + "epoch": 1.7615127347568817, + "grad_norm": 0.21333782374858856, + "learning_rate": 2.634593356242841e-06, + "loss": 0.2456, + "step": 855 + }, + { + "epoch": 1.763570877283252, + "grad_norm": 0.20801705121994019, + "learning_rate": 2.6116838487972513e-06, + "loss": 0.249, + "step": 856 + }, + { + "epoch": 1.7656290198096218, + "grad_norm": 0.2295520156621933, + "learning_rate": 2.5887743413516615e-06, + "loss": 0.2711, + "step": 857 + }, + { + "epoch": 1.7676871623359918, + "grad_norm": 0.21109919250011444, + "learning_rate": 2.565864833906071e-06, + "loss": 0.2484, + "step": 858 + }, + { + "epoch": 1.7697453048623617, + "grad_norm": 0.2123642861843109, + "learning_rate": 2.542955326460481e-06, + "loss": 0.2545, + "step": 859 + }, + { + "epoch": 1.7718034473887316, + "grad_norm": 0.20756429433822632, + "learning_rate": 2.520045819014891e-06, + "loss": 0.2479, + "step": 860 + }, + { + "epoch": 1.7738615899151016, + "grad_norm": 0.21249566972255707, + "learning_rate": 2.4971363115693013e-06, + "loss": 0.2473, + "step": 861 + }, + { + "epoch": 1.7759197324414715, + "grad_norm": 0.22438718378543854, + "learning_rate": 2.4742268041237115e-06, + "loss": 0.2549, + "step": 862 + }, + { + "epoch": 1.7779778749678417, + "grad_norm": 0.22067435085773468, + "learning_rate": 2.4513172966781217e-06, + "loss": 0.26, + "step": 863 + }, + { + "epoch": 1.7800360174942114, + "grad_norm": 0.2168402522802353, + "learning_rate": 2.428407789232532e-06, + "loss": 0.2541, + "step": 864 + }, + { + "epoch": 1.7820941600205815, + "grad_norm": 0.2113119214773178, + "learning_rate": 2.405498281786942e-06, + "loss": 0.2583, + "step": 865 + }, + { + "epoch": 1.7841523025469512, + "grad_norm": 0.21283333003520966, + "learning_rate": 2.3825887743413517e-06, + "loss": 0.2564, + "step": 866 + }, + { + "epoch": 1.7862104450733214, + "grad_norm": 0.21427619457244873, + "learning_rate": 2.359679266895762e-06, + "loss": 0.2533, + "step": 867 + }, + { + "epoch": 1.7882685875996913, + "grad_norm": 0.20976261794567108, + "learning_rate": 2.336769759450172e-06, + "loss": 0.2662, + "step": 868 + }, + { + "epoch": 1.7903267301260612, + "grad_norm": 0.22446084022521973, + "learning_rate": 2.3138602520045822e-06, + "loss": 0.2566, + "step": 869 + }, + { + "epoch": 1.7923848726524312, + "grad_norm": 0.21603813767433167, + "learning_rate": 2.290950744558992e-06, + "loss": 0.2427, + "step": 870 + }, + { + "epoch": 1.7944430151788011, + "grad_norm": 0.21606098115444183, + "learning_rate": 2.268041237113402e-06, + "loss": 0.2503, + "step": 871 + }, + { + "epoch": 1.796501157705171, + "grad_norm": 0.20895624160766602, + "learning_rate": 2.2451317296678123e-06, + "loss": 0.2423, + "step": 872 + }, + { + "epoch": 1.798559300231541, + "grad_norm": 0.21321886777877808, + "learning_rate": 2.222222222222222e-06, + "loss": 0.2514, + "step": 873 + }, + { + "epoch": 1.8006174427579111, + "grad_norm": 0.2091333568096161, + "learning_rate": 2.1993127147766322e-06, + "loss": 0.2593, + "step": 874 + }, + { + "epoch": 1.8026755852842808, + "grad_norm": 0.2109704166650772, + "learning_rate": 2.1764032073310424e-06, + "loss": 0.2626, + "step": 875 + }, + { + "epoch": 1.804733727810651, + "grad_norm": 0.21323198080062866, + "learning_rate": 2.1534936998854526e-06, + "loss": 0.2517, + "step": 876 + }, + { + "epoch": 1.8067918703370207, + "grad_norm": 0.21177341043949127, + "learning_rate": 2.1305841924398628e-06, + "loss": 0.2589, + "step": 877 + }, + { + "epoch": 1.8088500128633909, + "grad_norm": 0.21436013281345367, + "learning_rate": 2.107674684994273e-06, + "loss": 0.2498, + "step": 878 + }, + { + "epoch": 1.8109081553897606, + "grad_norm": 0.21496744453907013, + "learning_rate": 2.084765177548683e-06, + "loss": 0.2595, + "step": 879 + }, + { + "epoch": 1.8129662979161307, + "grad_norm": 0.21034789085388184, + "learning_rate": 2.061855670103093e-06, + "loss": 0.2494, + "step": 880 + }, + { + "epoch": 1.8150244404425007, + "grad_norm": 0.20836222171783447, + "learning_rate": 2.038946162657503e-06, + "loss": 0.2526, + "step": 881 + }, + { + "epoch": 1.8170825829688706, + "grad_norm": 0.21801823377609253, + "learning_rate": 2.016036655211913e-06, + "loss": 0.2594, + "step": 882 + }, + { + "epoch": 1.8191407254952405, + "grad_norm": 0.20607352256774902, + "learning_rate": 1.9931271477663233e-06, + "loss": 0.2636, + "step": 883 + }, + { + "epoch": 1.8211988680216105, + "grad_norm": 0.2141195684671402, + "learning_rate": 1.970217640320733e-06, + "loss": 0.2572, + "step": 884 + }, + { + "epoch": 1.8232570105479804, + "grad_norm": 0.2243940681219101, + "learning_rate": 1.9473081328751433e-06, + "loss": 0.2575, + "step": 885 + }, + { + "epoch": 1.8253151530743503, + "grad_norm": 0.20857423543930054, + "learning_rate": 1.9243986254295534e-06, + "loss": 0.2464, + "step": 886 + }, + { + "epoch": 1.8273732956007205, + "grad_norm": 0.19861185550689697, + "learning_rate": 1.9014891179839634e-06, + "loss": 0.2552, + "step": 887 + }, + { + "epoch": 1.8294314381270902, + "grad_norm": 0.2101699262857437, + "learning_rate": 1.8785796105383736e-06, + "loss": 0.2716, + "step": 888 + }, + { + "epoch": 1.8314895806534603, + "grad_norm": 0.2184407114982605, + "learning_rate": 1.8556701030927837e-06, + "loss": 0.2585, + "step": 889 + }, + { + "epoch": 1.83354772317983, + "grad_norm": 0.20467938482761383, + "learning_rate": 1.8327605956471937e-06, + "loss": 0.2548, + "step": 890 + }, + { + "epoch": 1.8356058657062002, + "grad_norm": 0.21270884573459625, + "learning_rate": 1.8098510882016038e-06, + "loss": 0.2663, + "step": 891 + }, + { + "epoch": 1.8376640082325701, + "grad_norm": 0.2190205454826355, + "learning_rate": 1.786941580756014e-06, + "loss": 0.2529, + "step": 892 + }, + { + "epoch": 1.83972215075894, + "grad_norm": 0.21621330082416534, + "learning_rate": 1.7640320733104242e-06, + "loss": 0.2587, + "step": 893 + }, + { + "epoch": 1.84178029328531, + "grad_norm": 0.22347432374954224, + "learning_rate": 1.741122565864834e-06, + "loss": 0.2547, + "step": 894 + }, + { + "epoch": 1.84383843581168, + "grad_norm": 0.20814360678195953, + "learning_rate": 1.718213058419244e-06, + "loss": 0.2431, + "step": 895 + }, + { + "epoch": 1.8458965783380499, + "grad_norm": 0.2169455736875534, + "learning_rate": 1.6953035509736543e-06, + "loss": 0.2471, + "step": 896 + }, + { + "epoch": 1.8479547208644198, + "grad_norm": 0.20700973272323608, + "learning_rate": 1.6723940435280642e-06, + "loss": 0.2532, + "step": 897 + }, + { + "epoch": 1.85001286339079, + "grad_norm": 0.2104254812002182, + "learning_rate": 1.6494845360824744e-06, + "loss": 0.2417, + "step": 898 + }, + { + "epoch": 1.8520710059171597, + "grad_norm": 0.2133847326040268, + "learning_rate": 1.6265750286368846e-06, + "loss": 0.2499, + "step": 899 + }, + { + "epoch": 1.8541291484435298, + "grad_norm": 0.21578392386436462, + "learning_rate": 1.6036655211912947e-06, + "loss": 0.252, + "step": 900 + }, + { + "epoch": 1.8541291484435298, + "eval_loss": 0.2747899889945984, + "eval_runtime": 2422.9271, + "eval_samples_per_second": 3.209, + "eval_steps_per_second": 0.802, + "step": 900 + } + ], + "logging_steps": 1, + "max_steps": 970, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.9742128235888476e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-900/training_args.bin b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-900/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..5999c7ee9dd10ee9076d748e4757533e635fa832 --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-900/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ee55a11f5a306eb7c39b536fdfe2459bc279e468da50f6adda478c4deffcb812 +size 5688 diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-970/README.md b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-970/README.md new file mode 100644 index 0000000000000000000000000000000000000000..fb0d77d70fdc5c829c8889cb85828736b7eb9714 --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-970/README.md @@ -0,0 +1,202 @@ +--- +base_model: unsloth/codegemma-7b-it +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.15.0 \ No newline at end of file diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-970/adapter_config.json b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-970/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e841602c6a59fc7b085ac647af4d4c312445d261 --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-970/adapter_config.json @@ -0,0 +1,39 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "unsloth/codegemma-7b-it", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "o_proj", + "up_proj", + "q_proj", + "down_proj", + "gate_proj", + "k_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-970/adapter_model.safetensors b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-970/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..de27ccddf1ad082a43332935be888e13539a266a --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-970/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b9a683aab0635490d46fb72d7c531a3a443ffe4435c539ad2d7f933ae1397d5 +size 800116456 diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-970/optimizer.pt b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-970/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..9994b7b0c056a7818aa969f7bb7c0e399a3aab8a --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-970/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f88ddee65a49383d75be1568e455163fc13e9d144c1b8649e58c3633d08e6ba4 +size 406743860 diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-970/rng_state.pth b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-970/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..9e592b50af4d30623a5c177e1ac193d73b203039 --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-970/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2c4aa5f142e26419c0313b75006c5ad7308aabd3eb589818b54e875dbea3034b +size 14244 diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-970/scheduler.pt b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-970/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..4c112b660babc3a68bb4b477c9d243053cd7196d --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-970/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:15ae2b9c9fb88b9e8fcdcb8e7370c89f7a7772c68bd30fdc0c0f6cdcb56763c8 +size 1064 diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-970/special_tokens_map.json b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-970/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..f6119589e367b2de0fc8cbd2f1217667532e3174 --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-970/special_tokens_map.json @@ -0,0 +1,30 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-970/tokenizer.json b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-970/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..45a5e23f54141c5f4f97a8d58f3ffadc28e287ba --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-970/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d964a2c8346d40f95791533eae48730d5f163c2e65fd16333560fd3e661df318 +size 34362915 diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-970/tokenizer.model b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-970/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..71a98ce40269d847e58957e1e070d9ae8eb184af --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-970/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:583f2ebd2a1936009b7da991ea255504db68c7a9713a78673d1335a87098966c +size 4241023 diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-970/tokenizer_config.json b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-970/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9b9b1b4acdd4afcedae39d1cf6f0bc7ef7d9910f --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-970/tokenizer_config.json @@ -0,0 +1,2011 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "<|fim_prefix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "<|fim_middle|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "<|fim_suffix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "<|file_separator|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "139": { + "content": "▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "140": { + "content": "▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "141": { + "content": "▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "142": { + "content": "▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "143": { + "content": "▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "144": { + "content": "▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "145": { + "content": "▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "146": { + "content": "▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "147": { + "content": "▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "148": { + "content": "▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "149": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "150": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "152": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "153": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "154": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "155": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "156": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "157": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "158": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "159": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "160": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "161": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "162": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "163": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "164": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "165": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "166": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "167": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "168": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255968": { + "content": "[toxicity=0]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255969": { + "content": "\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255970": { + "content": "\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255971": { + "content": "\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255972": { + "content": "\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255973": { + "content": "\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255974": { + "content": "\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255975": { + "content": "\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255976": { + "content": "\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255977": { + "content": "\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255978": { + "content": "\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255979": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255980": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255981": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255982": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255983": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255984": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255985": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255986": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255987": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255988": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255989": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255990": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255991": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255992": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255993": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255994": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255995": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255996": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255997": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255998": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255999": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "bos_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "extra_special_tokens": {}, + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "padding_side": "left", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-970/trainer_state.json b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-970/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..244bb72f459c14f5fd5aea4926bc0522c1e46305 --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-970/trainer_state.json @@ -0,0 +1,6896 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.9981991252894264, + "eval_steps": 100, + "global_step": 970, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0020581425263699513, + "grad_norm": 11.994463920593262, + "learning_rate": 2.061855670103093e-07, + "loss": 2.91, + "step": 1 + }, + { + "epoch": 0.004116285052739903, + "grad_norm": 11.769092559814453, + "learning_rate": 4.123711340206186e-07, + "loss": 2.8686, + "step": 2 + }, + { + "epoch": 0.0061744275791098535, + "grad_norm": 13.05551815032959, + "learning_rate": 6.185567010309279e-07, + "loss": 3.0286, + "step": 3 + }, + { + "epoch": 0.008232570105479805, + "grad_norm": 12.334521293640137, + "learning_rate": 8.247422680412372e-07, + "loss": 2.904, + "step": 4 + }, + { + "epoch": 0.010290712631849755, + "grad_norm": 12.075353622436523, + "learning_rate": 1.0309278350515464e-06, + "loss": 2.8991, + "step": 5 + }, + { + "epoch": 0.012348855158219707, + "grad_norm": 11.86032485961914, + "learning_rate": 1.2371134020618557e-06, + "loss": 3.0007, + "step": 6 + }, + { + "epoch": 0.014406997684589657, + "grad_norm": 10.10457992553711, + "learning_rate": 1.4432989690721649e-06, + "loss": 2.8493, + "step": 7 + }, + { + "epoch": 0.01646514021095961, + "grad_norm": 8.56408405303955, + "learning_rate": 1.6494845360824744e-06, + "loss": 2.9573, + "step": 8 + }, + { + "epoch": 0.01852328273732956, + "grad_norm": 6.307392120361328, + "learning_rate": 1.8556701030927837e-06, + "loss": 2.9507, + "step": 9 + }, + { + "epoch": 0.02058142526369951, + "grad_norm": 4.276430130004883, + "learning_rate": 2.061855670103093e-06, + "loss": 2.8988, + "step": 10 + }, + { + "epoch": 0.022639567790069464, + "grad_norm": 2.5912015438079834, + "learning_rate": 2.268041237113402e-06, + "loss": 2.9926, + "step": 11 + }, + { + "epoch": 0.024697710316439414, + "grad_norm": 2.018446207046509, + "learning_rate": 2.4742268041237115e-06, + "loss": 2.9874, + "step": 12 + }, + { + "epoch": 0.026755852842809364, + "grad_norm": 1.8558588027954102, + "learning_rate": 2.680412371134021e-06, + "loss": 2.8608, + "step": 13 + }, + { + "epoch": 0.028813995369179314, + "grad_norm": 1.9658265113830566, + "learning_rate": 2.8865979381443297e-06, + "loss": 2.8596, + "step": 14 + }, + { + "epoch": 0.030872137895549268, + "grad_norm": 1.872044563293457, + "learning_rate": 3.0927835051546395e-06, + "loss": 2.8836, + "step": 15 + }, + { + "epoch": 0.03293028042191922, + "grad_norm": 1.8884096145629883, + "learning_rate": 3.298969072164949e-06, + "loss": 2.9383, + "step": 16 + }, + { + "epoch": 0.03498842294828917, + "grad_norm": 1.8795744180679321, + "learning_rate": 3.5051546391752577e-06, + "loss": 2.883, + "step": 17 + }, + { + "epoch": 0.03704656547465912, + "grad_norm": 1.783678412437439, + "learning_rate": 3.7113402061855674e-06, + "loss": 2.8019, + "step": 18 + }, + { + "epoch": 0.039104708001029075, + "grad_norm": 1.820617914199829, + "learning_rate": 3.917525773195877e-06, + "loss": 2.8813, + "step": 19 + }, + { + "epoch": 0.04116285052739902, + "grad_norm": 1.8188731670379639, + "learning_rate": 4.123711340206186e-06, + "loss": 2.8401, + "step": 20 + }, + { + "epoch": 0.043220993053768975, + "grad_norm": 1.7305251359939575, + "learning_rate": 4.329896907216495e-06, + "loss": 2.7478, + "step": 21 + }, + { + "epoch": 0.04527913558013893, + "grad_norm": 1.7014551162719727, + "learning_rate": 4.536082474226804e-06, + "loss": 2.7356, + "step": 22 + }, + { + "epoch": 0.047337278106508875, + "grad_norm": 1.677381157875061, + "learning_rate": 4.742268041237113e-06, + "loss": 2.7593, + "step": 23 + }, + { + "epoch": 0.04939542063287883, + "grad_norm": 1.628554344177246, + "learning_rate": 4.948453608247423e-06, + "loss": 2.7689, + "step": 24 + }, + { + "epoch": 0.051453563159248775, + "grad_norm": 1.4968128204345703, + "learning_rate": 5.154639175257732e-06, + "loss": 2.6613, + "step": 25 + }, + { + "epoch": 0.05351170568561873, + "grad_norm": 1.4734832048416138, + "learning_rate": 5.360824742268042e-06, + "loss": 2.7095, + "step": 26 + }, + { + "epoch": 0.05556984821198868, + "grad_norm": 1.3745571374893188, + "learning_rate": 5.567010309278351e-06, + "loss": 2.655, + "step": 27 + }, + { + "epoch": 0.05762799073835863, + "grad_norm": 1.3381729125976562, + "learning_rate": 5.7731958762886594e-06, + "loss": 2.55, + "step": 28 + }, + { + "epoch": 0.05968613326472858, + "grad_norm": 1.3388073444366455, + "learning_rate": 5.979381443298969e-06, + "loss": 2.5219, + "step": 29 + }, + { + "epoch": 0.061744275791098535, + "grad_norm": 1.317008376121521, + "learning_rate": 6.185567010309279e-06, + "loss": 2.4491, + "step": 30 + }, + { + "epoch": 0.06380241831746848, + "grad_norm": 1.3210794925689697, + "learning_rate": 6.391752577319588e-06, + "loss": 2.4358, + "step": 31 + }, + { + "epoch": 0.06586056084383844, + "grad_norm": 1.182519555091858, + "learning_rate": 6.597938144329898e-06, + "loss": 2.4514, + "step": 32 + }, + { + "epoch": 0.06791870337020839, + "grad_norm": 1.2238099575042725, + "learning_rate": 6.804123711340207e-06, + "loss": 2.442, + "step": 33 + }, + { + "epoch": 0.06997684589657834, + "grad_norm": 1.1793314218521118, + "learning_rate": 7.010309278350515e-06, + "loss": 2.3864, + "step": 34 + }, + { + "epoch": 0.0720349884229483, + "grad_norm": 1.1983020305633545, + "learning_rate": 7.216494845360825e-06, + "loss": 2.3796, + "step": 35 + }, + { + "epoch": 0.07409313094931824, + "grad_norm": 1.2189652919769287, + "learning_rate": 7.422680412371135e-06, + "loss": 2.4152, + "step": 36 + }, + { + "epoch": 0.07615127347568819, + "grad_norm": 1.14923095703125, + "learning_rate": 7.628865979381444e-06, + "loss": 2.3298, + "step": 37 + }, + { + "epoch": 0.07820941600205815, + "grad_norm": 1.147013545036316, + "learning_rate": 7.835051546391754e-06, + "loss": 2.2488, + "step": 38 + }, + { + "epoch": 0.0802675585284281, + "grad_norm": 1.133981466293335, + "learning_rate": 8.041237113402063e-06, + "loss": 2.1825, + "step": 39 + }, + { + "epoch": 0.08232570105479804, + "grad_norm": 1.1686867475509644, + "learning_rate": 8.247422680412371e-06, + "loss": 2.2282, + "step": 40 + }, + { + "epoch": 0.084383843581168, + "grad_norm": 1.131690502166748, + "learning_rate": 8.453608247422681e-06, + "loss": 2.0962, + "step": 41 + }, + { + "epoch": 0.08644198610753795, + "grad_norm": 1.1626195907592773, + "learning_rate": 8.65979381443299e-06, + "loss": 2.1161, + "step": 42 + }, + { + "epoch": 0.0885001286339079, + "grad_norm": 1.1508581638336182, + "learning_rate": 8.865979381443299e-06, + "loss": 1.9856, + "step": 43 + }, + { + "epoch": 0.09055827116027786, + "grad_norm": 1.2286733388900757, + "learning_rate": 9.072164948453609e-06, + "loss": 2.076, + "step": 44 + }, + { + "epoch": 0.0926164136866478, + "grad_norm": 1.82068932056427, + "learning_rate": 9.278350515463918e-06, + "loss": 1.9995, + "step": 45 + }, + { + "epoch": 0.09467455621301775, + "grad_norm": 2.079101324081421, + "learning_rate": 9.484536082474226e-06, + "loss": 1.9601, + "step": 46 + }, + { + "epoch": 0.0967326987393877, + "grad_norm": 1.1209226846694946, + "learning_rate": 9.690721649484536e-06, + "loss": 1.9346, + "step": 47 + }, + { + "epoch": 0.09879084126575766, + "grad_norm": 1.0579711198806763, + "learning_rate": 9.896907216494846e-06, + "loss": 1.8764, + "step": 48 + }, + { + "epoch": 0.1008489837921276, + "grad_norm": 1.0434011220932007, + "learning_rate": 1.0103092783505156e-05, + "loss": 1.8483, + "step": 49 + }, + { + "epoch": 0.10290712631849755, + "grad_norm": 1.0089991092681885, + "learning_rate": 1.0309278350515464e-05, + "loss": 1.8018, + "step": 50 + }, + { + "epoch": 0.10496526884486751, + "grad_norm": 1.0117324590682983, + "learning_rate": 1.0515463917525775e-05, + "loss": 1.8003, + "step": 51 + }, + { + "epoch": 0.10702341137123746, + "grad_norm": 1.0006697177886963, + "learning_rate": 1.0721649484536083e-05, + "loss": 1.7482, + "step": 52 + }, + { + "epoch": 0.1090815538976074, + "grad_norm": 2.1164329051971436, + "learning_rate": 1.0927835051546391e-05, + "loss": 1.7363, + "step": 53 + }, + { + "epoch": 0.11113969642397736, + "grad_norm": 0.9573502540588379, + "learning_rate": 1.1134020618556703e-05, + "loss": 1.661, + "step": 54 + }, + { + "epoch": 0.11319783895034731, + "grad_norm": 1.0059764385223389, + "learning_rate": 1.134020618556701e-05, + "loss": 1.6979, + "step": 55 + }, + { + "epoch": 0.11525598147671726, + "grad_norm": 0.9719656109809875, + "learning_rate": 1.1546391752577319e-05, + "loss": 1.6318, + "step": 56 + }, + { + "epoch": 0.11731412400308722, + "grad_norm": 1.0024539232254028, + "learning_rate": 1.175257731958763e-05, + "loss": 1.6283, + "step": 57 + }, + { + "epoch": 0.11937226652945716, + "grad_norm": 0.9772456288337708, + "learning_rate": 1.1958762886597938e-05, + "loss": 1.5611, + "step": 58 + }, + { + "epoch": 0.12143040905582711, + "grad_norm": 0.9947625994682312, + "learning_rate": 1.2164948453608248e-05, + "loss": 1.6073, + "step": 59 + }, + { + "epoch": 0.12348855158219707, + "grad_norm": 2.112889051437378, + "learning_rate": 1.2371134020618558e-05, + "loss": 1.6208, + "step": 60 + }, + { + "epoch": 0.12554669410856703, + "grad_norm": 1.0515345335006714, + "learning_rate": 1.2577319587628866e-05, + "loss": 1.569, + "step": 61 + }, + { + "epoch": 0.12760483663493696, + "grad_norm": 1.0782145261764526, + "learning_rate": 1.2783505154639176e-05, + "loss": 1.5097, + "step": 62 + }, + { + "epoch": 0.12966297916130692, + "grad_norm": 1.154104232788086, + "learning_rate": 1.2989690721649485e-05, + "loss": 1.5472, + "step": 63 + }, + { + "epoch": 0.13172112168767688, + "grad_norm": 1.1614656448364258, + "learning_rate": 1.3195876288659795e-05, + "loss": 1.4833, + "step": 64 + }, + { + "epoch": 0.13377926421404682, + "grad_norm": 1.1720911264419556, + "learning_rate": 1.3402061855670103e-05, + "loss": 1.4644, + "step": 65 + }, + { + "epoch": 0.13583740674041678, + "grad_norm": 1.8903896808624268, + "learning_rate": 1.3608247422680415e-05, + "loss": 1.4286, + "step": 66 + }, + { + "epoch": 0.13789554926678674, + "grad_norm": 1.2675013542175293, + "learning_rate": 1.3814432989690723e-05, + "loss": 1.416, + "step": 67 + }, + { + "epoch": 0.13995369179315667, + "grad_norm": 1.266434907913208, + "learning_rate": 1.402061855670103e-05, + "loss": 1.3171, + "step": 68 + }, + { + "epoch": 0.14201183431952663, + "grad_norm": 1.3408889770507812, + "learning_rate": 1.4226804123711342e-05, + "loss": 1.3396, + "step": 69 + }, + { + "epoch": 0.1440699768458966, + "grad_norm": 1.3862446546554565, + "learning_rate": 1.443298969072165e-05, + "loss": 1.2642, + "step": 70 + }, + { + "epoch": 0.14612811937226652, + "grad_norm": 2.110553026199341, + "learning_rate": 1.4639175257731958e-05, + "loss": 1.2593, + "step": 71 + }, + { + "epoch": 0.14818626189863648, + "grad_norm": 1.7017499208450317, + "learning_rate": 1.484536082474227e-05, + "loss": 1.24, + "step": 72 + }, + { + "epoch": 0.15024440442500644, + "grad_norm": 1.9851700067520142, + "learning_rate": 1.5051546391752578e-05, + "loss": 1.2313, + "step": 73 + }, + { + "epoch": 0.15230254695137638, + "grad_norm": 2.009608030319214, + "learning_rate": 1.5257731958762888e-05, + "loss": 1.1281, + "step": 74 + }, + { + "epoch": 0.15436068947774634, + "grad_norm": 2.7587485313415527, + "learning_rate": 1.5463917525773197e-05, + "loss": 1.1248, + "step": 75 + }, + { + "epoch": 0.1564188320041163, + "grad_norm": 2.780954599380493, + "learning_rate": 1.5670103092783507e-05, + "loss": 1.0797, + "step": 76 + }, + { + "epoch": 0.15847697453048623, + "grad_norm": 3.1470866203308105, + "learning_rate": 1.5876288659793813e-05, + "loss": 1.0064, + "step": 77 + }, + { + "epoch": 0.1605351170568562, + "grad_norm": 4.653595447540283, + "learning_rate": 1.6082474226804127e-05, + "loss": 0.9219, + "step": 78 + }, + { + "epoch": 0.16259325958322615, + "grad_norm": 4.157363414764404, + "learning_rate": 1.6288659793814433e-05, + "loss": 0.8709, + "step": 79 + }, + { + "epoch": 0.16465140210959608, + "grad_norm": 4.5814924240112305, + "learning_rate": 1.6494845360824743e-05, + "loss": 0.7693, + "step": 80 + }, + { + "epoch": 0.16670954463596604, + "grad_norm": 5.096139907836914, + "learning_rate": 1.6701030927835052e-05, + "loss": 0.6868, + "step": 81 + }, + { + "epoch": 0.168767687162336, + "grad_norm": 4.858880519866943, + "learning_rate": 1.6907216494845362e-05, + "loss": 0.5971, + "step": 82 + }, + { + "epoch": 0.17082582968870594, + "grad_norm": 4.42564582824707, + "learning_rate": 1.7113402061855672e-05, + "loss": 0.4719, + "step": 83 + }, + { + "epoch": 0.1728839722150759, + "grad_norm": 7.720851421356201, + "learning_rate": 1.731958762886598e-05, + "loss": 0.3943, + "step": 84 + }, + { + "epoch": 0.17494211474144586, + "grad_norm": 0.41923192143440247, + "learning_rate": 1.752577319587629e-05, + "loss": 0.3635, + "step": 85 + }, + { + "epoch": 0.1770002572678158, + "grad_norm": 0.2771846354007721, + "learning_rate": 1.7731958762886598e-05, + "loss": 0.3597, + "step": 86 + }, + { + "epoch": 0.17905839979418575, + "grad_norm": 0.24761857092380524, + "learning_rate": 1.793814432989691e-05, + "loss": 0.3735, + "step": 87 + }, + { + "epoch": 0.1811165423205557, + "grad_norm": 0.23277048766613007, + "learning_rate": 1.8144329896907217e-05, + "loss": 0.3643, + "step": 88 + }, + { + "epoch": 0.18317468484692565, + "grad_norm": 0.22931228578090668, + "learning_rate": 1.8350515463917527e-05, + "loss": 0.3519, + "step": 89 + }, + { + "epoch": 0.1852328273732956, + "grad_norm": 0.20750615000724792, + "learning_rate": 1.8556701030927837e-05, + "loss": 0.3431, + "step": 90 + }, + { + "epoch": 0.18729096989966554, + "grad_norm": 0.2080322951078415, + "learning_rate": 1.8762886597938147e-05, + "loss": 0.3632, + "step": 91 + }, + { + "epoch": 0.1893491124260355, + "grad_norm": 0.20186181366443634, + "learning_rate": 1.8969072164948453e-05, + "loss": 0.3492, + "step": 92 + }, + { + "epoch": 0.19140725495240546, + "grad_norm": 0.19172786176204681, + "learning_rate": 1.9175257731958766e-05, + "loss": 0.3552, + "step": 93 + }, + { + "epoch": 0.1934653974787754, + "grad_norm": 0.1747850626707077, + "learning_rate": 1.9381443298969072e-05, + "loss": 0.3355, + "step": 94 + }, + { + "epoch": 0.19552354000514535, + "grad_norm": 0.196411594748497, + "learning_rate": 1.9587628865979382e-05, + "loss": 0.3271, + "step": 95 + }, + { + "epoch": 0.1975816825315153, + "grad_norm": 0.20063228905200958, + "learning_rate": 1.9793814432989692e-05, + "loss": 0.3351, + "step": 96 + }, + { + "epoch": 0.19963982505788525, + "grad_norm": 0.19240939617156982, + "learning_rate": 2e-05, + "loss": 0.3266, + "step": 97 + }, + { + "epoch": 0.2016979675842552, + "grad_norm": 0.18206572532653809, + "learning_rate": 1.997709049255441e-05, + "loss": 0.3393, + "step": 98 + }, + { + "epoch": 0.20375611011062517, + "grad_norm": 0.20384562015533447, + "learning_rate": 1.9954180985108823e-05, + "loss": 0.3395, + "step": 99 + }, + { + "epoch": 0.2058142526369951, + "grad_norm": 0.19944581389427185, + "learning_rate": 1.9931271477663232e-05, + "loss": 0.3268, + "step": 100 + }, + { + "epoch": 0.2058142526369951, + "eval_loss": 0.3456890285015106, + "eval_runtime": 2114.0178, + "eval_samples_per_second": 3.677, + "eval_steps_per_second": 0.92, + "step": 100 + }, + { + "epoch": 0.20787239516336506, + "grad_norm": 0.17743557691574097, + "learning_rate": 1.990836197021764e-05, + "loss": 0.3439, + "step": 101 + }, + { + "epoch": 0.20993053768973502, + "grad_norm": 0.18746449053287506, + "learning_rate": 1.9885452462772053e-05, + "loss": 0.326, + "step": 102 + }, + { + "epoch": 0.21198868021610495, + "grad_norm": 0.18555815517902374, + "learning_rate": 1.9862542955326462e-05, + "loss": 0.3337, + "step": 103 + }, + { + "epoch": 0.2140468227424749, + "grad_norm": 0.16591575741767883, + "learning_rate": 1.9839633447880874e-05, + "loss": 0.3121, + "step": 104 + }, + { + "epoch": 0.21610496526884487, + "grad_norm": 0.1621987372636795, + "learning_rate": 1.9816723940435283e-05, + "loss": 0.3287, + "step": 105 + }, + { + "epoch": 0.2181631077952148, + "grad_norm": 0.1614532470703125, + "learning_rate": 1.9793814432989692e-05, + "loss": 0.3306, + "step": 106 + }, + { + "epoch": 0.22022125032158477, + "grad_norm": 0.17993387579917908, + "learning_rate": 1.9770904925544104e-05, + "loss": 0.3341, + "step": 107 + }, + { + "epoch": 0.22227939284795473, + "grad_norm": 0.1550011783838272, + "learning_rate": 1.9747995418098513e-05, + "loss": 0.3197, + "step": 108 + }, + { + "epoch": 0.22433753537432466, + "grad_norm": 0.18471524119377136, + "learning_rate": 1.9725085910652922e-05, + "loss": 0.3285, + "step": 109 + }, + { + "epoch": 0.22639567790069462, + "grad_norm": 0.15604373812675476, + "learning_rate": 1.9702176403207334e-05, + "loss": 0.3298, + "step": 110 + }, + { + "epoch": 0.22845382042706458, + "grad_norm": 0.1682298630475998, + "learning_rate": 1.9679266895761743e-05, + "loss": 0.3343, + "step": 111 + }, + { + "epoch": 0.2305119629534345, + "grad_norm": 0.14933635294437408, + "learning_rate": 1.9656357388316152e-05, + "loss": 0.3134, + "step": 112 + }, + { + "epoch": 0.23257010547980447, + "grad_norm": 0.14892347157001495, + "learning_rate": 1.963344788087056e-05, + "loss": 0.3154, + "step": 113 + }, + { + "epoch": 0.23462824800617443, + "grad_norm": 0.1577889323234558, + "learning_rate": 1.9610538373424973e-05, + "loss": 0.3122, + "step": 114 + }, + { + "epoch": 0.23668639053254437, + "grad_norm": 0.16482344269752502, + "learning_rate": 1.9587628865979382e-05, + "loss": 0.3193, + "step": 115 + }, + { + "epoch": 0.23874453305891433, + "grad_norm": 0.15328913927078247, + "learning_rate": 1.956471935853379e-05, + "loss": 0.3217, + "step": 116 + }, + { + "epoch": 0.2408026755852843, + "grad_norm": 0.16140656173229218, + "learning_rate": 1.9541809851088203e-05, + "loss": 0.318, + "step": 117 + }, + { + "epoch": 0.24286081811165422, + "grad_norm": 0.15448373556137085, + "learning_rate": 1.9518900343642612e-05, + "loss": 0.3205, + "step": 118 + }, + { + "epoch": 0.24491896063802418, + "grad_norm": 0.14716887474060059, + "learning_rate": 1.9495990836197025e-05, + "loss": 0.3164, + "step": 119 + }, + { + "epoch": 0.24697710316439414, + "grad_norm": 0.16582027077674866, + "learning_rate": 1.9473081328751433e-05, + "loss": 0.3191, + "step": 120 + }, + { + "epoch": 0.24903524569076407, + "grad_norm": 0.15213699638843536, + "learning_rate": 1.9450171821305842e-05, + "loss": 0.304, + "step": 121 + }, + { + "epoch": 0.25109338821713406, + "grad_norm": 0.1659238487482071, + "learning_rate": 1.9427262313860255e-05, + "loss": 0.3184, + "step": 122 + }, + { + "epoch": 0.253151530743504, + "grad_norm": 0.15596656501293182, + "learning_rate": 1.9404352806414663e-05, + "loss": 0.3092, + "step": 123 + }, + { + "epoch": 0.2552096732698739, + "grad_norm": 0.15868476033210754, + "learning_rate": 1.9381443298969072e-05, + "loss": 0.3163, + "step": 124 + }, + { + "epoch": 0.2572678157962439, + "grad_norm": 0.15386095643043518, + "learning_rate": 1.9358533791523485e-05, + "loss": 0.3049, + "step": 125 + }, + { + "epoch": 0.25932595832261385, + "grad_norm": 0.15179213881492615, + "learning_rate": 1.9335624284077894e-05, + "loss": 0.3131, + "step": 126 + }, + { + "epoch": 0.2613841008489838, + "grad_norm": 0.1595134735107422, + "learning_rate": 1.9312714776632306e-05, + "loss": 0.3069, + "step": 127 + }, + { + "epoch": 0.26344224337535377, + "grad_norm": 0.16989803314208984, + "learning_rate": 1.9289805269186715e-05, + "loss": 0.3052, + "step": 128 + }, + { + "epoch": 0.2655003859017237, + "grad_norm": 0.14803892374038696, + "learning_rate": 1.9266895761741124e-05, + "loss": 0.3065, + "step": 129 + }, + { + "epoch": 0.26755852842809363, + "grad_norm": 0.16676583886146545, + "learning_rate": 1.9243986254295536e-05, + "loss": 0.2962, + "step": 130 + }, + { + "epoch": 0.2696166709544636, + "grad_norm": 0.15694552659988403, + "learning_rate": 1.9221076746849945e-05, + "loss": 0.3096, + "step": 131 + }, + { + "epoch": 0.27167481348083355, + "grad_norm": 0.17696696519851685, + "learning_rate": 1.9198167239404354e-05, + "loss": 0.3145, + "step": 132 + }, + { + "epoch": 0.2737329560072035, + "grad_norm": 0.17204038798809052, + "learning_rate": 1.9175257731958766e-05, + "loss": 0.3248, + "step": 133 + }, + { + "epoch": 0.2757910985335735, + "grad_norm": 0.15630359947681427, + "learning_rate": 1.9152348224513175e-05, + "loss": 0.3117, + "step": 134 + }, + { + "epoch": 0.2778492410599434, + "grad_norm": 0.15757997334003448, + "learning_rate": 1.9129438717067584e-05, + "loss": 0.3145, + "step": 135 + }, + { + "epoch": 0.27990738358631334, + "grad_norm": 0.16273653507232666, + "learning_rate": 1.9106529209621996e-05, + "loss": 0.3159, + "step": 136 + }, + { + "epoch": 0.28196552611268333, + "grad_norm": 0.16213104128837585, + "learning_rate": 1.9083619702176405e-05, + "loss": 0.2949, + "step": 137 + }, + { + "epoch": 0.28402366863905326, + "grad_norm": 0.15377865731716156, + "learning_rate": 1.9060710194730814e-05, + "loss": 0.306, + "step": 138 + }, + { + "epoch": 0.2860818111654232, + "grad_norm": 0.1545962244272232, + "learning_rate": 1.9037800687285223e-05, + "loss": 0.2966, + "step": 139 + }, + { + "epoch": 0.2881399536917932, + "grad_norm": 0.15516617894172668, + "learning_rate": 1.9014891179839635e-05, + "loss": 0.3122, + "step": 140 + }, + { + "epoch": 0.2901980962181631, + "grad_norm": 0.14734458923339844, + "learning_rate": 1.8991981672394044e-05, + "loss": 0.3118, + "step": 141 + }, + { + "epoch": 0.29225623874453305, + "grad_norm": 0.1644304096698761, + "learning_rate": 1.8969072164948453e-05, + "loss": 0.3027, + "step": 142 + }, + { + "epoch": 0.29431438127090304, + "grad_norm": 0.14632569253444672, + "learning_rate": 1.8946162657502865e-05, + "loss": 0.3023, + "step": 143 + }, + { + "epoch": 0.29637252379727297, + "grad_norm": 0.1573137789964676, + "learning_rate": 1.8923253150057274e-05, + "loss": 0.3102, + "step": 144 + }, + { + "epoch": 0.2984306663236429, + "grad_norm": 0.16423144936561584, + "learning_rate": 1.8900343642611686e-05, + "loss": 0.3033, + "step": 145 + }, + { + "epoch": 0.3004888088500129, + "grad_norm": 0.15420907735824585, + "learning_rate": 1.8877434135166095e-05, + "loss": 0.3089, + "step": 146 + }, + { + "epoch": 0.3025469513763828, + "grad_norm": 0.1579178273677826, + "learning_rate": 1.8854524627720504e-05, + "loss": 0.3071, + "step": 147 + }, + { + "epoch": 0.30460509390275275, + "grad_norm": 0.15866397321224213, + "learning_rate": 1.8831615120274916e-05, + "loss": 0.3083, + "step": 148 + }, + { + "epoch": 0.30666323642912274, + "grad_norm": 0.16651487350463867, + "learning_rate": 1.8808705612829325e-05, + "loss": 0.3099, + "step": 149 + }, + { + "epoch": 0.3087213789554927, + "grad_norm": 0.16281908750534058, + "learning_rate": 1.8785796105383734e-05, + "loss": 0.3034, + "step": 150 + }, + { + "epoch": 0.3107795214818626, + "grad_norm": 0.17449837923049927, + "learning_rate": 1.8762886597938147e-05, + "loss": 0.3054, + "step": 151 + }, + { + "epoch": 0.3128376640082326, + "grad_norm": 0.15403546392917633, + "learning_rate": 1.8739977090492555e-05, + "loss": 0.297, + "step": 152 + }, + { + "epoch": 0.31489580653460253, + "grad_norm": 0.1472466140985489, + "learning_rate": 1.8717067583046968e-05, + "loss": 0.2973, + "step": 153 + }, + { + "epoch": 0.31695394906097246, + "grad_norm": 0.16027937829494476, + "learning_rate": 1.8694158075601377e-05, + "loss": 0.3054, + "step": 154 + }, + { + "epoch": 0.31901209158734245, + "grad_norm": 0.17086225748062134, + "learning_rate": 1.8671248568155786e-05, + "loss": 0.307, + "step": 155 + }, + { + "epoch": 0.3210702341137124, + "grad_norm": 0.15930697321891785, + "learning_rate": 1.8648339060710198e-05, + "loss": 0.293, + "step": 156 + }, + { + "epoch": 0.3231283766400823, + "grad_norm": 0.17086376249790192, + "learning_rate": 1.8625429553264607e-05, + "loss": 0.293, + "step": 157 + }, + { + "epoch": 0.3251865191664523, + "grad_norm": 0.15970875322818756, + "learning_rate": 1.8602520045819016e-05, + "loss": 0.3083, + "step": 158 + }, + { + "epoch": 0.32724466169282224, + "grad_norm": 0.16355909407138824, + "learning_rate": 1.8579610538373428e-05, + "loss": 0.3139, + "step": 159 + }, + { + "epoch": 0.32930280421919217, + "grad_norm": 0.15183711051940918, + "learning_rate": 1.8556701030927837e-05, + "loss": 0.2953, + "step": 160 + }, + { + "epoch": 0.33136094674556216, + "grad_norm": 0.15123715996742249, + "learning_rate": 1.853379152348225e-05, + "loss": 0.3025, + "step": 161 + }, + { + "epoch": 0.3334190892719321, + "grad_norm": 0.1576143503189087, + "learning_rate": 1.8510882016036658e-05, + "loss": 0.2904, + "step": 162 + }, + { + "epoch": 0.335477231798302, + "grad_norm": 0.1457504779100418, + "learning_rate": 1.8487972508591067e-05, + "loss": 0.2909, + "step": 163 + }, + { + "epoch": 0.337535374324672, + "grad_norm": 0.1557442992925644, + "learning_rate": 1.846506300114548e-05, + "loss": 0.3027, + "step": 164 + }, + { + "epoch": 0.33959351685104194, + "grad_norm": 0.15662318468093872, + "learning_rate": 1.8442153493699888e-05, + "loss": 0.311, + "step": 165 + }, + { + "epoch": 0.3416516593774119, + "grad_norm": 0.16177058219909668, + "learning_rate": 1.8419243986254297e-05, + "loss": 0.2944, + "step": 166 + }, + { + "epoch": 0.34370980190378186, + "grad_norm": 0.16406729817390442, + "learning_rate": 1.8396334478808706e-05, + "loss": 0.2927, + "step": 167 + }, + { + "epoch": 0.3457679444301518, + "grad_norm": 0.16642791032791138, + "learning_rate": 1.8373424971363115e-05, + "loss": 0.3063, + "step": 168 + }, + { + "epoch": 0.34782608695652173, + "grad_norm": 0.1650693714618683, + "learning_rate": 1.8350515463917527e-05, + "loss": 0.2957, + "step": 169 + }, + { + "epoch": 0.3498842294828917, + "grad_norm": 0.15349675714969635, + "learning_rate": 1.8327605956471936e-05, + "loss": 0.297, + "step": 170 + }, + { + "epoch": 0.35194237200926165, + "grad_norm": 0.17770209908485413, + "learning_rate": 1.8304696449026348e-05, + "loss": 0.3011, + "step": 171 + }, + { + "epoch": 0.3540005145356316, + "grad_norm": 0.1647631675004959, + "learning_rate": 1.8281786941580757e-05, + "loss": 0.2962, + "step": 172 + }, + { + "epoch": 0.35605865706200157, + "grad_norm": 0.1603834480047226, + "learning_rate": 1.8258877434135166e-05, + "loss": 0.2937, + "step": 173 + }, + { + "epoch": 0.3581167995883715, + "grad_norm": 0.16780880093574524, + "learning_rate": 1.8235967926689578e-05, + "loss": 0.2997, + "step": 174 + }, + { + "epoch": 0.36017494211474144, + "grad_norm": 0.15976767241954803, + "learning_rate": 1.8213058419243987e-05, + "loss": 0.3043, + "step": 175 + }, + { + "epoch": 0.3622330846411114, + "grad_norm": 0.16236485540866852, + "learning_rate": 1.8190148911798396e-05, + "loss": 0.3069, + "step": 176 + }, + { + "epoch": 0.36429122716748136, + "grad_norm": 0.16391968727111816, + "learning_rate": 1.816723940435281e-05, + "loss": 0.2923, + "step": 177 + }, + { + "epoch": 0.3663493696938513, + "grad_norm": 0.15806889533996582, + "learning_rate": 1.8144329896907217e-05, + "loss": 0.2872, + "step": 178 + }, + { + "epoch": 0.3684075122202212, + "grad_norm": 0.1627352088689804, + "learning_rate": 1.812142038946163e-05, + "loss": 0.3032, + "step": 179 + }, + { + "epoch": 0.3704656547465912, + "grad_norm": 0.15103371441364288, + "learning_rate": 1.809851088201604e-05, + "loss": 0.2847, + "step": 180 + }, + { + "epoch": 0.37252379727296114, + "grad_norm": 0.15178488194942474, + "learning_rate": 1.8075601374570447e-05, + "loss": 0.3017, + "step": 181 + }, + { + "epoch": 0.3745819397993311, + "grad_norm": 0.15493899583816528, + "learning_rate": 1.805269186712486e-05, + "loss": 0.2901, + "step": 182 + }, + { + "epoch": 0.37664008232570106, + "grad_norm": 0.15990686416625977, + "learning_rate": 1.802978235967927e-05, + "loss": 0.2861, + "step": 183 + }, + { + "epoch": 0.378698224852071, + "grad_norm": 0.15824148058891296, + "learning_rate": 1.8006872852233677e-05, + "loss": 0.2885, + "step": 184 + }, + { + "epoch": 0.38075636737844093, + "grad_norm": 0.15690775215625763, + "learning_rate": 1.798396334478809e-05, + "loss": 0.2814, + "step": 185 + }, + { + "epoch": 0.3828145099048109, + "grad_norm": 0.15833796560764313, + "learning_rate": 1.79610538373425e-05, + "loss": 0.2847, + "step": 186 + }, + { + "epoch": 0.38487265243118085, + "grad_norm": 0.16560044884681702, + "learning_rate": 1.793814432989691e-05, + "loss": 0.3061, + "step": 187 + }, + { + "epoch": 0.3869307949575508, + "grad_norm": 0.16240179538726807, + "learning_rate": 1.791523482245132e-05, + "loss": 0.2943, + "step": 188 + }, + { + "epoch": 0.38898893748392077, + "grad_norm": 0.15825721621513367, + "learning_rate": 1.789232531500573e-05, + "loss": 0.2934, + "step": 189 + }, + { + "epoch": 0.3910470800102907, + "grad_norm": 0.16665388643741608, + "learning_rate": 1.786941580756014e-05, + "loss": 0.291, + "step": 190 + }, + { + "epoch": 0.39310522253666064, + "grad_norm": 0.16581200063228607, + "learning_rate": 1.784650630011455e-05, + "loss": 0.2849, + "step": 191 + }, + { + "epoch": 0.3951633650630306, + "grad_norm": 0.1604345291852951, + "learning_rate": 1.782359679266896e-05, + "loss": 0.3, + "step": 192 + }, + { + "epoch": 0.39722150758940056, + "grad_norm": 0.16107915341854095, + "learning_rate": 1.7800687285223368e-05, + "loss": 0.2847, + "step": 193 + }, + { + "epoch": 0.3992796501157705, + "grad_norm": 0.1571730375289917, + "learning_rate": 1.7777777777777777e-05, + "loss": 0.2863, + "step": 194 + }, + { + "epoch": 0.4013377926421405, + "grad_norm": 0.1656399518251419, + "learning_rate": 1.775486827033219e-05, + "loss": 0.2878, + "step": 195 + }, + { + "epoch": 0.4033959351685104, + "grad_norm": 0.16738460958003998, + "learning_rate": 1.7731958762886598e-05, + "loss": 0.286, + "step": 196 + }, + { + "epoch": 0.40545407769488034, + "grad_norm": 0.16704292595386505, + "learning_rate": 1.770904925544101e-05, + "loss": 0.2919, + "step": 197 + }, + { + "epoch": 0.40751222022125033, + "grad_norm": 0.16215579211711884, + "learning_rate": 1.768613974799542e-05, + "loss": 0.2874, + "step": 198 + }, + { + "epoch": 0.40957036274762026, + "grad_norm": 0.15573479235172272, + "learning_rate": 1.7663230240549828e-05, + "loss": 0.2904, + "step": 199 + }, + { + "epoch": 0.4116285052739902, + "grad_norm": 0.1707623153924942, + "learning_rate": 1.764032073310424e-05, + "loss": 0.289, + "step": 200 + }, + { + "epoch": 0.4116285052739902, + "eval_loss": 0.3214050829410553, + "eval_runtime": 2449.7742, + "eval_samples_per_second": 3.173, + "eval_steps_per_second": 0.794, + "step": 200 + }, + { + "epoch": 0.4136866478003602, + "grad_norm": 0.1699172556400299, + "learning_rate": 1.761741122565865e-05, + "loss": 0.2852, + "step": 201 + }, + { + "epoch": 0.4157447903267301, + "grad_norm": 0.19150058925151825, + "learning_rate": 1.7594501718213058e-05, + "loss": 0.29, + "step": 202 + }, + { + "epoch": 0.41780293285310005, + "grad_norm": 0.15794627368450165, + "learning_rate": 1.757159221076747e-05, + "loss": 0.2746, + "step": 203 + }, + { + "epoch": 0.41986107537947004, + "grad_norm": 0.17305190861225128, + "learning_rate": 1.754868270332188e-05, + "loss": 0.3003, + "step": 204 + }, + { + "epoch": 0.42191921790583997, + "grad_norm": 0.16257523000240326, + "learning_rate": 1.752577319587629e-05, + "loss": 0.2789, + "step": 205 + }, + { + "epoch": 0.4239773604322099, + "grad_norm": 0.17273619771003723, + "learning_rate": 1.75028636884307e-05, + "loss": 0.2917, + "step": 206 + }, + { + "epoch": 0.4260355029585799, + "grad_norm": 0.17502790689468384, + "learning_rate": 1.747995418098511e-05, + "loss": 0.2992, + "step": 207 + }, + { + "epoch": 0.4280936454849498, + "grad_norm": 0.16464050114154816, + "learning_rate": 1.745704467353952e-05, + "loss": 0.2873, + "step": 208 + }, + { + "epoch": 0.43015178801131976, + "grad_norm": 0.1681668758392334, + "learning_rate": 1.743413516609393e-05, + "loss": 0.2991, + "step": 209 + }, + { + "epoch": 0.43220993053768975, + "grad_norm": 0.16957956552505493, + "learning_rate": 1.741122565864834e-05, + "loss": 0.2868, + "step": 210 + }, + { + "epoch": 0.4342680730640597, + "grad_norm": 0.15875883400440216, + "learning_rate": 1.738831615120275e-05, + "loss": 0.2946, + "step": 211 + }, + { + "epoch": 0.4363262155904296, + "grad_norm": 0.18127889931201935, + "learning_rate": 1.736540664375716e-05, + "loss": 0.2835, + "step": 212 + }, + { + "epoch": 0.4383843581167996, + "grad_norm": 0.17822811007499695, + "learning_rate": 1.7342497136311573e-05, + "loss": 0.2944, + "step": 213 + }, + { + "epoch": 0.44044250064316953, + "grad_norm": 0.17555806040763855, + "learning_rate": 1.731958762886598e-05, + "loss": 0.3001, + "step": 214 + }, + { + "epoch": 0.44250064316953946, + "grad_norm": 0.18709121644496918, + "learning_rate": 1.729667812142039e-05, + "loss": 0.282, + "step": 215 + }, + { + "epoch": 0.44455878569590945, + "grad_norm": 0.16322475671768188, + "learning_rate": 1.7273768613974803e-05, + "loss": 0.2883, + "step": 216 + }, + { + "epoch": 0.4466169282222794, + "grad_norm": 0.1677054911851883, + "learning_rate": 1.7250859106529212e-05, + "loss": 0.28, + "step": 217 + }, + { + "epoch": 0.4486750707486493, + "grad_norm": 0.15764063596725464, + "learning_rate": 1.722794959908362e-05, + "loss": 0.2768, + "step": 218 + }, + { + "epoch": 0.4507332132750193, + "grad_norm": 0.16166841983795166, + "learning_rate": 1.7205040091638033e-05, + "loss": 0.2868, + "step": 219 + }, + { + "epoch": 0.45279135580138924, + "grad_norm": 0.1799350380897522, + "learning_rate": 1.7182130584192442e-05, + "loss": 0.2891, + "step": 220 + }, + { + "epoch": 0.45484949832775917, + "grad_norm": 0.18119174242019653, + "learning_rate": 1.715922107674685e-05, + "loss": 0.2841, + "step": 221 + }, + { + "epoch": 0.45690764085412916, + "grad_norm": 0.17725548148155212, + "learning_rate": 1.713631156930126e-05, + "loss": 0.3038, + "step": 222 + }, + { + "epoch": 0.4589657833804991, + "grad_norm": 0.1628233790397644, + "learning_rate": 1.7113402061855672e-05, + "loss": 0.2868, + "step": 223 + }, + { + "epoch": 0.461023925906869, + "grad_norm": 0.1745166927576065, + "learning_rate": 1.709049255441008e-05, + "loss": 0.3033, + "step": 224 + }, + { + "epoch": 0.463082068433239, + "grad_norm": 0.17708267271518707, + "learning_rate": 1.706758304696449e-05, + "loss": 0.2842, + "step": 225 + }, + { + "epoch": 0.46514021095960895, + "grad_norm": 0.1738453358411789, + "learning_rate": 1.7044673539518902e-05, + "loss": 0.3005, + "step": 226 + }, + { + "epoch": 0.4671983534859789, + "grad_norm": 0.1706874966621399, + "learning_rate": 1.702176403207331e-05, + "loss": 0.2924, + "step": 227 + }, + { + "epoch": 0.46925649601234887, + "grad_norm": 0.1697423756122589, + "learning_rate": 1.699885452462772e-05, + "loss": 0.2783, + "step": 228 + }, + { + "epoch": 0.4713146385387188, + "grad_norm": 0.1783403754234314, + "learning_rate": 1.6975945017182132e-05, + "loss": 0.2924, + "step": 229 + }, + { + "epoch": 0.47337278106508873, + "grad_norm": 0.17431536316871643, + "learning_rate": 1.695303550973654e-05, + "loss": 0.2792, + "step": 230 + }, + { + "epoch": 0.4754309235914587, + "grad_norm": 0.164026141166687, + "learning_rate": 1.6930126002290953e-05, + "loss": 0.2825, + "step": 231 + }, + { + "epoch": 0.47748906611782865, + "grad_norm": 0.16449657082557678, + "learning_rate": 1.6907216494845362e-05, + "loss": 0.2831, + "step": 232 + }, + { + "epoch": 0.4795472086441986, + "grad_norm": 0.1812741607427597, + "learning_rate": 1.688430698739977e-05, + "loss": 0.2849, + "step": 233 + }, + { + "epoch": 0.4816053511705686, + "grad_norm": 0.18431834876537323, + "learning_rate": 1.6861397479954183e-05, + "loss": 0.2802, + "step": 234 + }, + { + "epoch": 0.4836634936969385, + "grad_norm": 0.18349015712738037, + "learning_rate": 1.6838487972508592e-05, + "loss": 0.2804, + "step": 235 + }, + { + "epoch": 0.48572163622330844, + "grad_norm": 0.1769968420267105, + "learning_rate": 1.6815578465063e-05, + "loss": 0.2777, + "step": 236 + }, + { + "epoch": 0.4877797787496784, + "grad_norm": 0.17207500338554382, + "learning_rate": 1.6792668957617413e-05, + "loss": 0.2883, + "step": 237 + }, + { + "epoch": 0.48983792127604836, + "grad_norm": 0.1729692667722702, + "learning_rate": 1.6769759450171822e-05, + "loss": 0.2784, + "step": 238 + }, + { + "epoch": 0.4918960638024183, + "grad_norm": 0.17234881222248077, + "learning_rate": 1.6746849942726235e-05, + "loss": 0.2816, + "step": 239 + }, + { + "epoch": 0.4939542063287883, + "grad_norm": 0.17132551968097687, + "learning_rate": 1.6723940435280644e-05, + "loss": 0.2812, + "step": 240 + }, + { + "epoch": 0.4960123488551582, + "grad_norm": 0.1752254068851471, + "learning_rate": 1.6701030927835052e-05, + "loss": 0.2799, + "step": 241 + }, + { + "epoch": 0.49807049138152815, + "grad_norm": 0.1768665313720703, + "learning_rate": 1.6678121420389465e-05, + "loss": 0.2966, + "step": 242 + }, + { + "epoch": 0.5001286339078981, + "grad_norm": 0.18139514327049255, + "learning_rate": 1.6655211912943874e-05, + "loss": 0.2816, + "step": 243 + }, + { + "epoch": 0.5021867764342681, + "grad_norm": 0.17312943935394287, + "learning_rate": 1.6632302405498283e-05, + "loss": 0.2845, + "step": 244 + }, + { + "epoch": 0.5042449189606381, + "grad_norm": 0.17966389656066895, + "learning_rate": 1.6609392898052695e-05, + "loss": 0.2864, + "step": 245 + }, + { + "epoch": 0.506303061487008, + "grad_norm": 0.16653811931610107, + "learning_rate": 1.6586483390607104e-05, + "loss": 0.2759, + "step": 246 + }, + { + "epoch": 0.5083612040133779, + "grad_norm": 0.1634613424539566, + "learning_rate": 1.6563573883161516e-05, + "loss": 0.2728, + "step": 247 + }, + { + "epoch": 0.5104193465397479, + "grad_norm": 0.17358507215976715, + "learning_rate": 1.654066437571592e-05, + "loss": 0.2706, + "step": 248 + }, + { + "epoch": 0.5124774890661178, + "grad_norm": 0.17524316906929016, + "learning_rate": 1.6517754868270334e-05, + "loss": 0.2805, + "step": 249 + }, + { + "epoch": 0.5145356315924878, + "grad_norm": 0.18134094774723053, + "learning_rate": 1.6494845360824743e-05, + "loss": 0.2909, + "step": 250 + }, + { + "epoch": 0.5165937741188578, + "grad_norm": 0.17795510590076447, + "learning_rate": 1.647193585337915e-05, + "loss": 0.2889, + "step": 251 + }, + { + "epoch": 0.5186519166452277, + "grad_norm": 0.16782547533512115, + "learning_rate": 1.6449026345933564e-05, + "loss": 0.2842, + "step": 252 + }, + { + "epoch": 0.5207100591715976, + "grad_norm": 0.17360062897205353, + "learning_rate": 1.6426116838487973e-05, + "loss": 0.2763, + "step": 253 + }, + { + "epoch": 0.5227682016979676, + "grad_norm": 0.17241406440734863, + "learning_rate": 1.6403207331042385e-05, + "loss": 0.2753, + "step": 254 + }, + { + "epoch": 0.5248263442243375, + "grad_norm": 0.1709229201078415, + "learning_rate": 1.6380297823596794e-05, + "loss": 0.2732, + "step": 255 + }, + { + "epoch": 0.5268844867507075, + "grad_norm": 0.1807374209165573, + "learning_rate": 1.6357388316151203e-05, + "loss": 0.2856, + "step": 256 + }, + { + "epoch": 0.5289426292770775, + "grad_norm": 0.1749904304742813, + "learning_rate": 1.6334478808705615e-05, + "loss": 0.285, + "step": 257 + }, + { + "epoch": 0.5310007718034474, + "grad_norm": 0.16673170030117035, + "learning_rate": 1.6311569301260024e-05, + "loss": 0.2825, + "step": 258 + }, + { + "epoch": 0.5330589143298173, + "grad_norm": 0.17239685356616974, + "learning_rate": 1.6288659793814433e-05, + "loss": 0.2845, + "step": 259 + }, + { + "epoch": 0.5351170568561873, + "grad_norm": 0.1831504851579666, + "learning_rate": 1.6265750286368845e-05, + "loss": 0.2859, + "step": 260 + }, + { + "epoch": 0.5371751993825572, + "grad_norm": 0.18507827818393707, + "learning_rate": 1.6242840778923254e-05, + "loss": 0.293, + "step": 261 + }, + { + "epoch": 0.5392333419089272, + "grad_norm": 0.16738134622573853, + "learning_rate": 1.6219931271477663e-05, + "loss": 0.2853, + "step": 262 + }, + { + "epoch": 0.5412914844352972, + "grad_norm": 0.1701226830482483, + "learning_rate": 1.6197021764032075e-05, + "loss": 0.2763, + "step": 263 + }, + { + "epoch": 0.5433496269616671, + "grad_norm": 0.18195705115795135, + "learning_rate": 1.6174112256586484e-05, + "loss": 0.2797, + "step": 264 + }, + { + "epoch": 0.545407769488037, + "grad_norm": 0.1832309514284134, + "learning_rate": 1.6151202749140896e-05, + "loss": 0.2885, + "step": 265 + }, + { + "epoch": 0.547465912014407, + "grad_norm": 0.1773810088634491, + "learning_rate": 1.6128293241695305e-05, + "loss": 0.2682, + "step": 266 + }, + { + "epoch": 0.5495240545407769, + "grad_norm": 0.16989603638648987, + "learning_rate": 1.6105383734249714e-05, + "loss": 0.2821, + "step": 267 + }, + { + "epoch": 0.551582197067147, + "grad_norm": 0.17835170030593872, + "learning_rate": 1.6082474226804127e-05, + "loss": 0.2774, + "step": 268 + }, + { + "epoch": 0.5536403395935169, + "grad_norm": 0.1777082234621048, + "learning_rate": 1.6059564719358535e-05, + "loss": 0.2726, + "step": 269 + }, + { + "epoch": 0.5556984821198868, + "grad_norm": 0.18766450881958008, + "learning_rate": 1.6036655211912944e-05, + "loss": 0.2879, + "step": 270 + }, + { + "epoch": 0.5577566246462567, + "grad_norm": 0.1868186593055725, + "learning_rate": 1.6013745704467357e-05, + "loss": 0.2808, + "step": 271 + }, + { + "epoch": 0.5598147671726267, + "grad_norm": 0.16695882380008698, + "learning_rate": 1.5990836197021766e-05, + "loss": 0.2668, + "step": 272 + }, + { + "epoch": 0.5618729096989966, + "grad_norm": 0.17224495112895966, + "learning_rate": 1.5967926689576178e-05, + "loss": 0.2682, + "step": 273 + }, + { + "epoch": 0.5639310522253667, + "grad_norm": 0.20116423070430756, + "learning_rate": 1.5945017182130587e-05, + "loss": 0.276, + "step": 274 + }, + { + "epoch": 0.5659891947517366, + "grad_norm": 0.19478343427181244, + "learning_rate": 1.5922107674684996e-05, + "loss": 0.2854, + "step": 275 + }, + { + "epoch": 0.5680473372781065, + "grad_norm": 0.20242950320243835, + "learning_rate": 1.5899198167239405e-05, + "loss": 0.2854, + "step": 276 + }, + { + "epoch": 0.5701054798044765, + "grad_norm": 0.19146093726158142, + "learning_rate": 1.5876288659793813e-05, + "loss": 0.2817, + "step": 277 + }, + { + "epoch": 0.5721636223308464, + "grad_norm": 0.1804896742105484, + "learning_rate": 1.5853379152348226e-05, + "loss": 0.2714, + "step": 278 + }, + { + "epoch": 0.5742217648572163, + "grad_norm": 0.19315646588802338, + "learning_rate": 1.5830469644902635e-05, + "loss": 0.2703, + "step": 279 + }, + { + "epoch": 0.5762799073835864, + "grad_norm": 0.1910266876220703, + "learning_rate": 1.5807560137457047e-05, + "loss": 0.2728, + "step": 280 + }, + { + "epoch": 0.5783380499099563, + "grad_norm": 0.20330773293972015, + "learning_rate": 1.5784650630011456e-05, + "loss": 0.2717, + "step": 281 + }, + { + "epoch": 0.5803961924363262, + "grad_norm": 0.19080683588981628, + "learning_rate": 1.5761741122565865e-05, + "loss": 0.2679, + "step": 282 + }, + { + "epoch": 0.5824543349626962, + "grad_norm": 0.18052135407924652, + "learning_rate": 1.5738831615120277e-05, + "loss": 0.2815, + "step": 283 + }, + { + "epoch": 0.5845124774890661, + "grad_norm": 0.1998361051082611, + "learning_rate": 1.5715922107674686e-05, + "loss": 0.2888, + "step": 284 + }, + { + "epoch": 0.586570620015436, + "grad_norm": 0.1978764683008194, + "learning_rate": 1.5693012600229095e-05, + "loss": 0.2926, + "step": 285 + }, + { + "epoch": 0.5886287625418061, + "grad_norm": 0.17189203202724457, + "learning_rate": 1.5670103092783507e-05, + "loss": 0.2674, + "step": 286 + }, + { + "epoch": 0.590686905068176, + "grad_norm": 0.1937166303396225, + "learning_rate": 1.5647193585337916e-05, + "loss": 0.2838, + "step": 287 + }, + { + "epoch": 0.5927450475945459, + "grad_norm": 0.18978627026081085, + "learning_rate": 1.5624284077892328e-05, + "loss": 0.273, + "step": 288 + }, + { + "epoch": 0.5948031901209159, + "grad_norm": 0.17718705534934998, + "learning_rate": 1.5601374570446737e-05, + "loss": 0.2842, + "step": 289 + }, + { + "epoch": 0.5968613326472858, + "grad_norm": 0.1912536770105362, + "learning_rate": 1.5578465063001146e-05, + "loss": 0.2736, + "step": 290 + }, + { + "epoch": 0.5989194751736557, + "grad_norm": 0.18104907870292664, + "learning_rate": 1.555555555555556e-05, + "loss": 0.274, + "step": 291 + }, + { + "epoch": 0.6009776177000258, + "grad_norm": 0.1620381772518158, + "learning_rate": 1.5532646048109967e-05, + "loss": 0.2663, + "step": 292 + }, + { + "epoch": 0.6030357602263957, + "grad_norm": 0.17973916232585907, + "learning_rate": 1.5509736540664376e-05, + "loss": 0.2791, + "step": 293 + }, + { + "epoch": 0.6050939027527656, + "grad_norm": 0.16821186244487762, + "learning_rate": 1.548682703321879e-05, + "loss": 0.2787, + "step": 294 + }, + { + "epoch": 0.6071520452791356, + "grad_norm": 0.18426693975925446, + "learning_rate": 1.5463917525773197e-05, + "loss": 0.2886, + "step": 295 + }, + { + "epoch": 0.6092101878055055, + "grad_norm": 0.19796033203601837, + "learning_rate": 1.5441008018327606e-05, + "loss": 0.268, + "step": 296 + }, + { + "epoch": 0.6112683303318754, + "grad_norm": 0.1971343755722046, + "learning_rate": 1.541809851088202e-05, + "loss": 0.2761, + "step": 297 + }, + { + "epoch": 0.6133264728582455, + "grad_norm": 0.17458567023277283, + "learning_rate": 1.5395189003436427e-05, + "loss": 0.2831, + "step": 298 + }, + { + "epoch": 0.6153846153846154, + "grad_norm": 0.17610400915145874, + "learning_rate": 1.537227949599084e-05, + "loss": 0.2691, + "step": 299 + }, + { + "epoch": 0.6174427579109854, + "grad_norm": 0.1929042488336563, + "learning_rate": 1.534936998854525e-05, + "loss": 0.2847, + "step": 300 + }, + { + "epoch": 0.6174427579109854, + "eval_loss": 0.2959522604942322, + "eval_runtime": 2428.6339, + "eval_samples_per_second": 3.201, + "eval_steps_per_second": 0.8, + "step": 300 + }, + { + "epoch": 0.6195009004373553, + "grad_norm": 0.19430233538150787, + "learning_rate": 1.5326460481099657e-05, + "loss": 0.279, + "step": 301 + }, + { + "epoch": 0.6215590429637252, + "grad_norm": 0.18542642891407013, + "learning_rate": 1.5303550973654066e-05, + "loss": 0.2695, + "step": 302 + }, + { + "epoch": 0.6236171854900951, + "grad_norm": 0.1850169450044632, + "learning_rate": 1.5280641466208475e-05, + "loss": 0.2847, + "step": 303 + }, + { + "epoch": 0.6256753280164652, + "grad_norm": 0.18449267745018005, + "learning_rate": 1.5257731958762888e-05, + "loss": 0.2804, + "step": 304 + }, + { + "epoch": 0.6277334705428351, + "grad_norm": 0.18608458340168, + "learning_rate": 1.5234822451317296e-05, + "loss": 0.2792, + "step": 305 + }, + { + "epoch": 0.6297916130692051, + "grad_norm": 0.21136076748371124, + "learning_rate": 1.5211912943871707e-05, + "loss": 0.2829, + "step": 306 + }, + { + "epoch": 0.631849755595575, + "grad_norm": 0.19672206044197083, + "learning_rate": 1.5189003436426118e-05, + "loss": 0.2854, + "step": 307 + }, + { + "epoch": 0.6339078981219449, + "grad_norm": 0.1834034025669098, + "learning_rate": 1.5166093928980528e-05, + "loss": 0.2775, + "step": 308 + }, + { + "epoch": 0.6359660406483149, + "grad_norm": 0.18414819240570068, + "learning_rate": 1.5143184421534937e-05, + "loss": 0.2794, + "step": 309 + }, + { + "epoch": 0.6380241831746849, + "grad_norm": 0.1890152245759964, + "learning_rate": 1.5120274914089348e-05, + "loss": 0.2718, + "step": 310 + }, + { + "epoch": 0.6400823257010548, + "grad_norm": 0.18923887610435486, + "learning_rate": 1.5097365406643758e-05, + "loss": 0.2795, + "step": 311 + }, + { + "epoch": 0.6421404682274248, + "grad_norm": 0.20047079026699066, + "learning_rate": 1.5074455899198169e-05, + "loss": 0.2811, + "step": 312 + }, + { + "epoch": 0.6441986107537947, + "grad_norm": 0.1910201609134674, + "learning_rate": 1.5051546391752578e-05, + "loss": 0.2732, + "step": 313 + }, + { + "epoch": 0.6462567532801646, + "grad_norm": 0.2021956443786621, + "learning_rate": 1.5028636884306988e-05, + "loss": 0.2806, + "step": 314 + }, + { + "epoch": 0.6483148958065346, + "grad_norm": 0.18957914412021637, + "learning_rate": 1.5005727376861399e-05, + "loss": 0.2681, + "step": 315 + }, + { + "epoch": 0.6503730383329046, + "grad_norm": 0.19858811795711517, + "learning_rate": 1.498281786941581e-05, + "loss": 0.2805, + "step": 316 + }, + { + "epoch": 0.6524311808592745, + "grad_norm": 0.1731935292482376, + "learning_rate": 1.4959908361970218e-05, + "loss": 0.2646, + "step": 317 + }, + { + "epoch": 0.6544893233856445, + "grad_norm": 0.19619058072566986, + "learning_rate": 1.4936998854524629e-05, + "loss": 0.2965, + "step": 318 + }, + { + "epoch": 0.6565474659120144, + "grad_norm": 0.18745696544647217, + "learning_rate": 1.491408934707904e-05, + "loss": 0.2766, + "step": 319 + }, + { + "epoch": 0.6586056084383843, + "grad_norm": 0.18006449937820435, + "learning_rate": 1.489117983963345e-05, + "loss": 0.2788, + "step": 320 + }, + { + "epoch": 0.6606637509647543, + "grad_norm": 0.17593689262866974, + "learning_rate": 1.486827033218786e-05, + "loss": 0.2813, + "step": 321 + }, + { + "epoch": 0.6627218934911243, + "grad_norm": 0.18695640563964844, + "learning_rate": 1.484536082474227e-05, + "loss": 0.281, + "step": 322 + }, + { + "epoch": 0.6647800360174942, + "grad_norm": 0.17909488081932068, + "learning_rate": 1.482245131729668e-05, + "loss": 0.2814, + "step": 323 + }, + { + "epoch": 0.6668381785438642, + "grad_norm": 0.19074076414108276, + "learning_rate": 1.4799541809851091e-05, + "loss": 0.2721, + "step": 324 + }, + { + "epoch": 0.6688963210702341, + "grad_norm": 0.19175754487514496, + "learning_rate": 1.47766323024055e-05, + "loss": 0.2754, + "step": 325 + }, + { + "epoch": 0.670954463596604, + "grad_norm": 0.18646575510501862, + "learning_rate": 1.475372279495991e-05, + "loss": 0.2678, + "step": 326 + }, + { + "epoch": 0.673012606122974, + "grad_norm": 0.18553243577480316, + "learning_rate": 1.4730813287514321e-05, + "loss": 0.281, + "step": 327 + }, + { + "epoch": 0.675070748649344, + "grad_norm": 0.17120976746082306, + "learning_rate": 1.470790378006873e-05, + "loss": 0.2691, + "step": 328 + }, + { + "epoch": 0.677128891175714, + "grad_norm": 0.19170524179935455, + "learning_rate": 1.4684994272623139e-05, + "loss": 0.2685, + "step": 329 + }, + { + "epoch": 0.6791870337020839, + "grad_norm": 0.1851339191198349, + "learning_rate": 1.466208476517755e-05, + "loss": 0.266, + "step": 330 + }, + { + "epoch": 0.6812451762284538, + "grad_norm": 0.1678062081336975, + "learning_rate": 1.4639175257731958e-05, + "loss": 0.2609, + "step": 331 + }, + { + "epoch": 0.6833033187548238, + "grad_norm": 0.17913252115249634, + "learning_rate": 1.4616265750286369e-05, + "loss": 0.2716, + "step": 332 + }, + { + "epoch": 0.6853614612811937, + "grad_norm": 0.1859239637851715, + "learning_rate": 1.459335624284078e-05, + "loss": 0.2712, + "step": 333 + }, + { + "epoch": 0.6874196038075637, + "grad_norm": 0.18390226364135742, + "learning_rate": 1.457044673539519e-05, + "loss": 0.2827, + "step": 334 + }, + { + "epoch": 0.6894777463339337, + "grad_norm": 0.18520398437976837, + "learning_rate": 1.4547537227949599e-05, + "loss": 0.2721, + "step": 335 + }, + { + "epoch": 0.6915358888603036, + "grad_norm": 0.18416717648506165, + "learning_rate": 1.452462772050401e-05, + "loss": 0.2683, + "step": 336 + }, + { + "epoch": 0.6935940313866735, + "grad_norm": 0.18727894127368927, + "learning_rate": 1.450171821305842e-05, + "loss": 0.2733, + "step": 337 + }, + { + "epoch": 0.6956521739130435, + "grad_norm": 0.18597093224525452, + "learning_rate": 1.447880870561283e-05, + "loss": 0.2708, + "step": 338 + }, + { + "epoch": 0.6977103164394134, + "grad_norm": 0.1786068081855774, + "learning_rate": 1.445589919816724e-05, + "loss": 0.2667, + "step": 339 + }, + { + "epoch": 0.6997684589657834, + "grad_norm": 0.17466600239276886, + "learning_rate": 1.443298969072165e-05, + "loss": 0.2786, + "step": 340 + }, + { + "epoch": 0.7018266014921534, + "grad_norm": 0.185857355594635, + "learning_rate": 1.4410080183276061e-05, + "loss": 0.2759, + "step": 341 + }, + { + "epoch": 0.7038847440185233, + "grad_norm": 0.2004527747631073, + "learning_rate": 1.4387170675830471e-05, + "loss": 0.2847, + "step": 342 + }, + { + "epoch": 0.7059428865448932, + "grad_norm": 0.18774060904979706, + "learning_rate": 1.436426116838488e-05, + "loss": 0.2766, + "step": 343 + }, + { + "epoch": 0.7080010290712632, + "grad_norm": 0.1840328425168991, + "learning_rate": 1.4341351660939291e-05, + "loss": 0.2722, + "step": 344 + }, + { + "epoch": 0.7100591715976331, + "grad_norm": 0.19089624285697937, + "learning_rate": 1.4318442153493702e-05, + "loss": 0.2779, + "step": 345 + }, + { + "epoch": 0.7121173141240031, + "grad_norm": 0.1848018616437912, + "learning_rate": 1.4295532646048112e-05, + "loss": 0.2739, + "step": 346 + }, + { + "epoch": 0.7141754566503731, + "grad_norm": 0.18844038248062134, + "learning_rate": 1.4272623138602521e-05, + "loss": 0.27, + "step": 347 + }, + { + "epoch": 0.716233599176743, + "grad_norm": 0.19289302825927734, + "learning_rate": 1.4249713631156932e-05, + "loss": 0.2743, + "step": 348 + }, + { + "epoch": 0.7182917417031129, + "grad_norm": 0.18738920986652374, + "learning_rate": 1.4226804123711342e-05, + "loss": 0.2657, + "step": 349 + }, + { + "epoch": 0.7203498842294829, + "grad_norm": 0.1925181746482849, + "learning_rate": 1.4203894616265753e-05, + "loss": 0.2637, + "step": 350 + }, + { + "epoch": 0.7224080267558528, + "grad_norm": 0.19114750623703003, + "learning_rate": 1.4180985108820162e-05, + "loss": 0.2758, + "step": 351 + }, + { + "epoch": 0.7244661692822228, + "grad_norm": 0.18310120701789856, + "learning_rate": 1.4158075601374572e-05, + "loss": 0.2777, + "step": 352 + }, + { + "epoch": 0.7265243118085928, + "grad_norm": 0.2045605331659317, + "learning_rate": 1.4135166093928983e-05, + "loss": 0.2653, + "step": 353 + }, + { + "epoch": 0.7285824543349627, + "grad_norm": 0.1856454759836197, + "learning_rate": 1.4112256586483393e-05, + "loss": 0.267, + "step": 354 + }, + { + "epoch": 0.7306405968613326, + "grad_norm": 0.1855366826057434, + "learning_rate": 1.4089347079037802e-05, + "loss": 0.2805, + "step": 355 + }, + { + "epoch": 0.7326987393877026, + "grad_norm": 0.17913414537906647, + "learning_rate": 1.4066437571592213e-05, + "loss": 0.2755, + "step": 356 + }, + { + "epoch": 0.7347568819140725, + "grad_norm": 0.2057684361934662, + "learning_rate": 1.404352806414662e-05, + "loss": 0.2668, + "step": 357 + }, + { + "epoch": 0.7368150244404424, + "grad_norm": 0.190156951546669, + "learning_rate": 1.402061855670103e-05, + "loss": 0.2778, + "step": 358 + }, + { + "epoch": 0.7388731669668125, + "grad_norm": 0.19387219846248627, + "learning_rate": 1.3997709049255441e-05, + "loss": 0.2785, + "step": 359 + }, + { + "epoch": 0.7409313094931824, + "grad_norm": 0.1933836042881012, + "learning_rate": 1.3974799541809852e-05, + "loss": 0.2661, + "step": 360 + }, + { + "epoch": 0.7429894520195524, + "grad_norm": 0.19618812203407288, + "learning_rate": 1.3951890034364261e-05, + "loss": 0.2622, + "step": 361 + }, + { + "epoch": 0.7450475945459223, + "grad_norm": 0.18786942958831787, + "learning_rate": 1.3928980526918671e-05, + "loss": 0.2695, + "step": 362 + }, + { + "epoch": 0.7471057370722922, + "grad_norm": 0.19361330568790436, + "learning_rate": 1.3906071019473082e-05, + "loss": 0.2869, + "step": 363 + }, + { + "epoch": 0.7491638795986622, + "grad_norm": 0.19813291728496552, + "learning_rate": 1.3883161512027493e-05, + "loss": 0.2753, + "step": 364 + }, + { + "epoch": 0.7512220221250322, + "grad_norm": 0.1891734004020691, + "learning_rate": 1.3860252004581902e-05, + "loss": 0.2694, + "step": 365 + }, + { + "epoch": 0.7532801646514021, + "grad_norm": 0.18902742862701416, + "learning_rate": 1.3837342497136312e-05, + "loss": 0.2675, + "step": 366 + }, + { + "epoch": 0.7553383071777721, + "grad_norm": 0.19838480651378632, + "learning_rate": 1.3814432989690723e-05, + "loss": 0.2721, + "step": 367 + }, + { + "epoch": 0.757396449704142, + "grad_norm": 0.20880939066410065, + "learning_rate": 1.3791523482245133e-05, + "loss": 0.2641, + "step": 368 + }, + { + "epoch": 0.7594545922305119, + "grad_norm": 0.20068003237247467, + "learning_rate": 1.3768613974799542e-05, + "loss": 0.2945, + "step": 369 + }, + { + "epoch": 0.7615127347568819, + "grad_norm": 0.19780132174491882, + "learning_rate": 1.3745704467353953e-05, + "loss": 0.2687, + "step": 370 + }, + { + "epoch": 0.7635708772832519, + "grad_norm": 0.19194689393043518, + "learning_rate": 1.3722794959908363e-05, + "loss": 0.2731, + "step": 371 + }, + { + "epoch": 0.7656290198096218, + "grad_norm": 0.19504573941230774, + "learning_rate": 1.3699885452462774e-05, + "loss": 0.2551, + "step": 372 + }, + { + "epoch": 0.7676871623359918, + "grad_norm": 0.18304413557052612, + "learning_rate": 1.3676975945017183e-05, + "loss": 0.2692, + "step": 373 + }, + { + "epoch": 0.7697453048623617, + "grad_norm": 0.2051483392715454, + "learning_rate": 1.3654066437571593e-05, + "loss": 0.2791, + "step": 374 + }, + { + "epoch": 0.7718034473887316, + "grad_norm": 0.18748973309993744, + "learning_rate": 1.3631156930126004e-05, + "loss": 0.2671, + "step": 375 + }, + { + "epoch": 0.7738615899151016, + "grad_norm": 0.19167177379131317, + "learning_rate": 1.3608247422680415e-05, + "loss": 0.2766, + "step": 376 + }, + { + "epoch": 0.7759197324414716, + "grad_norm": 0.17931750416755676, + "learning_rate": 1.3585337915234824e-05, + "loss": 0.2748, + "step": 377 + }, + { + "epoch": 0.7779778749678415, + "grad_norm": 0.19437509775161743, + "learning_rate": 1.3562428407789234e-05, + "loss": 0.2667, + "step": 378 + }, + { + "epoch": 0.7800360174942115, + "grad_norm": 0.19813868403434753, + "learning_rate": 1.3539518900343645e-05, + "loss": 0.2771, + "step": 379 + }, + { + "epoch": 0.7820941600205814, + "grad_norm": 0.19205260276794434, + "learning_rate": 1.3516609392898055e-05, + "loss": 0.2703, + "step": 380 + }, + { + "epoch": 0.7841523025469513, + "grad_norm": 0.19039763510227203, + "learning_rate": 1.3493699885452464e-05, + "loss": 0.264, + "step": 381 + }, + { + "epoch": 0.7862104450733213, + "grad_norm": 0.18269500136375427, + "learning_rate": 1.3470790378006875e-05, + "loss": 0.2653, + "step": 382 + }, + { + "epoch": 0.7882685875996913, + "grad_norm": 0.1922067403793335, + "learning_rate": 1.3447880870561285e-05, + "loss": 0.2754, + "step": 383 + }, + { + "epoch": 0.7903267301260612, + "grad_norm": 0.19615666568279266, + "learning_rate": 1.3424971363115693e-05, + "loss": 0.2811, + "step": 384 + }, + { + "epoch": 0.7923848726524312, + "grad_norm": 0.19037973880767822, + "learning_rate": 1.3402061855670103e-05, + "loss": 0.2673, + "step": 385 + }, + { + "epoch": 0.7944430151788011, + "grad_norm": 0.191124826669693, + "learning_rate": 1.3379152348224514e-05, + "loss": 0.2683, + "step": 386 + }, + { + "epoch": 0.796501157705171, + "grad_norm": 0.18429923057556152, + "learning_rate": 1.3356242840778923e-05, + "loss": 0.2698, + "step": 387 + }, + { + "epoch": 0.798559300231541, + "grad_norm": 0.1839045137166977, + "learning_rate": 1.3333333333333333e-05, + "loss": 0.2895, + "step": 388 + }, + { + "epoch": 0.800617442757911, + "grad_norm": 0.1944131702184677, + "learning_rate": 1.3310423825887744e-05, + "loss": 0.2641, + "step": 389 + }, + { + "epoch": 0.802675585284281, + "grad_norm": 0.20407740771770477, + "learning_rate": 1.3287514318442154e-05, + "loss": 0.2743, + "step": 390 + }, + { + "epoch": 0.8047337278106509, + "grad_norm": 0.1814037561416626, + "learning_rate": 1.3264604810996563e-05, + "loss": 0.2672, + "step": 391 + }, + { + "epoch": 0.8067918703370208, + "grad_norm": 0.1886950582265854, + "learning_rate": 1.3241695303550974e-05, + "loss": 0.2725, + "step": 392 + }, + { + "epoch": 0.8088500128633908, + "grad_norm": 0.19429941475391388, + "learning_rate": 1.3218785796105385e-05, + "loss": 0.2669, + "step": 393 + }, + { + "epoch": 0.8109081553897607, + "grad_norm": 0.19143058359622955, + "learning_rate": 1.3195876288659795e-05, + "loss": 0.2659, + "step": 394 + }, + { + "epoch": 0.8129662979161307, + "grad_norm": 0.2213468849658966, + "learning_rate": 1.3172966781214204e-05, + "loss": 0.2764, + "step": 395 + }, + { + "epoch": 0.8150244404425007, + "grad_norm": 0.2040800005197525, + "learning_rate": 1.3150057273768615e-05, + "loss": 0.2783, + "step": 396 + }, + { + "epoch": 0.8170825829688706, + "grad_norm": 0.1948375254869461, + "learning_rate": 1.3127147766323025e-05, + "loss": 0.2689, + "step": 397 + }, + { + "epoch": 0.8191407254952405, + "grad_norm": 0.1915021538734436, + "learning_rate": 1.3104238258877436e-05, + "loss": 0.2808, + "step": 398 + }, + { + "epoch": 0.8211988680216105, + "grad_norm": 0.19760248064994812, + "learning_rate": 1.3081328751431845e-05, + "loss": 0.2712, + "step": 399 + }, + { + "epoch": 0.8232570105479804, + "grad_norm": 0.2082677185535431, + "learning_rate": 1.3058419243986255e-05, + "loss": 0.2707, + "step": 400 + }, + { + "epoch": 0.8232570105479804, + "eval_loss": 0.28778496384620667, + "eval_runtime": 2427.9096, + "eval_samples_per_second": 3.202, + "eval_steps_per_second": 0.801, + "step": 400 + }, + { + "epoch": 0.8253151530743504, + "grad_norm": 0.19694332778453827, + "learning_rate": 1.3035509736540666e-05, + "loss": 0.2801, + "step": 401 + }, + { + "epoch": 0.8273732956007204, + "grad_norm": 0.19448824226856232, + "learning_rate": 1.3012600229095077e-05, + "loss": 0.2632, + "step": 402 + }, + { + "epoch": 0.8294314381270903, + "grad_norm": 0.18745476007461548, + "learning_rate": 1.2989690721649485e-05, + "loss": 0.2773, + "step": 403 + }, + { + "epoch": 0.8314895806534602, + "grad_norm": 0.19524575769901276, + "learning_rate": 1.2966781214203896e-05, + "loss": 0.2594, + "step": 404 + }, + { + "epoch": 0.8335477231798302, + "grad_norm": 0.19612252712249756, + "learning_rate": 1.2943871706758307e-05, + "loss": 0.271, + "step": 405 + }, + { + "epoch": 0.8356058657062001, + "grad_norm": 0.19964493811130524, + "learning_rate": 1.2920962199312717e-05, + "loss": 0.2615, + "step": 406 + }, + { + "epoch": 0.8376640082325701, + "grad_norm": 0.20115099847316742, + "learning_rate": 1.2898052691867126e-05, + "loss": 0.269, + "step": 407 + }, + { + "epoch": 0.8397221507589401, + "grad_norm": 0.18949687480926514, + "learning_rate": 1.2875143184421537e-05, + "loss": 0.2649, + "step": 408 + }, + { + "epoch": 0.84178029328531, + "grad_norm": 0.1931927353143692, + "learning_rate": 1.2852233676975947e-05, + "loss": 0.2611, + "step": 409 + }, + { + "epoch": 0.8438384358116799, + "grad_norm": 0.18723614513874054, + "learning_rate": 1.2829324169530358e-05, + "loss": 0.2699, + "step": 410 + }, + { + "epoch": 0.8458965783380499, + "grad_norm": 0.19405977427959442, + "learning_rate": 1.2806414662084765e-05, + "loss": 0.2691, + "step": 411 + }, + { + "epoch": 0.8479547208644198, + "grad_norm": 0.2021879404783249, + "learning_rate": 1.2783505154639176e-05, + "loss": 0.267, + "step": 412 + }, + { + "epoch": 0.8500128633907899, + "grad_norm": 0.20015574991703033, + "learning_rate": 1.2760595647193586e-05, + "loss": 0.2632, + "step": 413 + }, + { + "epoch": 0.8520710059171598, + "grad_norm": 0.19090059399604797, + "learning_rate": 1.2737686139747995e-05, + "loss": 0.2743, + "step": 414 + }, + { + "epoch": 0.8541291484435297, + "grad_norm": 0.1906920224428177, + "learning_rate": 1.2714776632302406e-05, + "loss": 0.2723, + "step": 415 + }, + { + "epoch": 0.8561872909698997, + "grad_norm": 0.19348129630088806, + "learning_rate": 1.2691867124856816e-05, + "loss": 0.2656, + "step": 416 + }, + { + "epoch": 0.8582454334962696, + "grad_norm": 0.18771213293075562, + "learning_rate": 1.2668957617411227e-05, + "loss": 0.2617, + "step": 417 + }, + { + "epoch": 0.8603035760226395, + "grad_norm": 0.2135135382413864, + "learning_rate": 1.2646048109965636e-05, + "loss": 0.2773, + "step": 418 + }, + { + "epoch": 0.8623617185490096, + "grad_norm": 0.19689443707466125, + "learning_rate": 1.2623138602520046e-05, + "loss": 0.2623, + "step": 419 + }, + { + "epoch": 0.8644198610753795, + "grad_norm": 0.18752440810203552, + "learning_rate": 1.2600229095074457e-05, + "loss": 0.2599, + "step": 420 + }, + { + "epoch": 0.8664780036017494, + "grad_norm": 0.19264395534992218, + "learning_rate": 1.2577319587628866e-05, + "loss": 0.2707, + "step": 421 + }, + { + "epoch": 0.8685361461281194, + "grad_norm": 0.19980797171592712, + "learning_rate": 1.2554410080183277e-05, + "loss": 0.2616, + "step": 422 + }, + { + "epoch": 0.8705942886544893, + "grad_norm": 0.22940242290496826, + "learning_rate": 1.2531500572737687e-05, + "loss": 0.2712, + "step": 423 + }, + { + "epoch": 0.8726524311808592, + "grad_norm": 0.18825359642505646, + "learning_rate": 1.2508591065292098e-05, + "loss": 0.2779, + "step": 424 + }, + { + "epoch": 0.8747105737072293, + "grad_norm": 0.21553562581539154, + "learning_rate": 1.2485681557846507e-05, + "loss": 0.2677, + "step": 425 + }, + { + "epoch": 0.8767687162335992, + "grad_norm": 0.2025568038225174, + "learning_rate": 1.2462772050400917e-05, + "loss": 0.2659, + "step": 426 + }, + { + "epoch": 0.8788268587599691, + "grad_norm": 0.19179950654506683, + "learning_rate": 1.2439862542955328e-05, + "loss": 0.2762, + "step": 427 + }, + { + "epoch": 0.8808850012863391, + "grad_norm": 0.20982210338115692, + "learning_rate": 1.2416953035509738e-05, + "loss": 0.2648, + "step": 428 + }, + { + "epoch": 0.882943143812709, + "grad_norm": 0.2084280252456665, + "learning_rate": 1.2394043528064147e-05, + "loss": 0.2806, + "step": 429 + }, + { + "epoch": 0.8850012863390789, + "grad_norm": 0.1993308663368225, + "learning_rate": 1.2371134020618558e-05, + "loss": 0.2673, + "step": 430 + }, + { + "epoch": 0.887059428865449, + "grad_norm": 0.1917535811662674, + "learning_rate": 1.2348224513172968e-05, + "loss": 0.2596, + "step": 431 + }, + { + "epoch": 0.8891175713918189, + "grad_norm": 0.18980742990970612, + "learning_rate": 1.2325315005727379e-05, + "loss": 0.2607, + "step": 432 + }, + { + "epoch": 0.8911757139181888, + "grad_norm": 0.21062685549259186, + "learning_rate": 1.2302405498281788e-05, + "loss": 0.2612, + "step": 433 + }, + { + "epoch": 0.8932338564445588, + "grad_norm": 0.20591405034065247, + "learning_rate": 1.2279495990836199e-05, + "loss": 0.2698, + "step": 434 + }, + { + "epoch": 0.8952919989709287, + "grad_norm": 0.2052398920059204, + "learning_rate": 1.2256586483390609e-05, + "loss": 0.2673, + "step": 435 + }, + { + "epoch": 0.8973501414972986, + "grad_norm": 0.19963452219963074, + "learning_rate": 1.223367697594502e-05, + "loss": 0.266, + "step": 436 + }, + { + "epoch": 0.8994082840236687, + "grad_norm": 0.1929163783788681, + "learning_rate": 1.2210767468499429e-05, + "loss": 0.2605, + "step": 437 + }, + { + "epoch": 0.9014664265500386, + "grad_norm": 0.19121681153774261, + "learning_rate": 1.218785796105384e-05, + "loss": 0.2642, + "step": 438 + }, + { + "epoch": 0.9035245690764085, + "grad_norm": 0.18931221961975098, + "learning_rate": 1.2164948453608248e-05, + "loss": 0.2653, + "step": 439 + }, + { + "epoch": 0.9055827116027785, + "grad_norm": 0.21359370648860931, + "learning_rate": 1.2142038946162657e-05, + "loss": 0.264, + "step": 440 + }, + { + "epoch": 0.9076408541291484, + "grad_norm": 0.1874193251132965, + "learning_rate": 1.2119129438717068e-05, + "loss": 0.2664, + "step": 441 + }, + { + "epoch": 0.9096989966555183, + "grad_norm": 0.19697226583957672, + "learning_rate": 1.2096219931271478e-05, + "loss": 0.2651, + "step": 442 + }, + { + "epoch": 0.9117571391818884, + "grad_norm": 0.20930957794189453, + "learning_rate": 1.2073310423825889e-05, + "loss": 0.2724, + "step": 443 + }, + { + "epoch": 0.9138152817082583, + "grad_norm": 0.19588977098464966, + "learning_rate": 1.2050400916380298e-05, + "loss": 0.2648, + "step": 444 + }, + { + "epoch": 0.9158734242346283, + "grad_norm": 0.19452017545700073, + "learning_rate": 1.2027491408934708e-05, + "loss": 0.2808, + "step": 445 + }, + { + "epoch": 0.9179315667609982, + "grad_norm": 0.19226408004760742, + "learning_rate": 1.2004581901489119e-05, + "loss": 0.2627, + "step": 446 + }, + { + "epoch": 0.9199897092873681, + "grad_norm": 0.18108274042606354, + "learning_rate": 1.198167239404353e-05, + "loss": 0.2693, + "step": 447 + }, + { + "epoch": 0.922047851813738, + "grad_norm": 0.19352363049983978, + "learning_rate": 1.1958762886597938e-05, + "loss": 0.2705, + "step": 448 + }, + { + "epoch": 0.9241059943401081, + "grad_norm": 0.18535122275352478, + "learning_rate": 1.1935853379152349e-05, + "loss": 0.2608, + "step": 449 + }, + { + "epoch": 0.926164136866478, + "grad_norm": 0.19209617376327515, + "learning_rate": 1.191294387170676e-05, + "loss": 0.2702, + "step": 450 + }, + { + "epoch": 0.928222279392848, + "grad_norm": 0.1866796910762787, + "learning_rate": 1.189003436426117e-05, + "loss": 0.264, + "step": 451 + }, + { + "epoch": 0.9302804219192179, + "grad_norm": 0.21708665788173676, + "learning_rate": 1.1867124856815579e-05, + "loss": 0.2693, + "step": 452 + }, + { + "epoch": 0.9323385644455878, + "grad_norm": 0.19297796487808228, + "learning_rate": 1.184421534936999e-05, + "loss": 0.2745, + "step": 453 + }, + { + "epoch": 0.9343967069719578, + "grad_norm": 0.19070400297641754, + "learning_rate": 1.18213058419244e-05, + "loss": 0.265, + "step": 454 + }, + { + "epoch": 0.9364548494983278, + "grad_norm": 0.19821566343307495, + "learning_rate": 1.1798396334478809e-05, + "loss": 0.2674, + "step": 455 + }, + { + "epoch": 0.9385129920246977, + "grad_norm": 0.2032192200422287, + "learning_rate": 1.177548682703322e-05, + "loss": 0.276, + "step": 456 + }, + { + "epoch": 0.9405711345510677, + "grad_norm": 0.19127750396728516, + "learning_rate": 1.175257731958763e-05, + "loss": 0.2696, + "step": 457 + }, + { + "epoch": 0.9426292770774376, + "grad_norm": 0.19187286496162415, + "learning_rate": 1.1729667812142041e-05, + "loss": 0.2601, + "step": 458 + }, + { + "epoch": 0.9446874196038075, + "grad_norm": 0.20871371030807495, + "learning_rate": 1.170675830469645e-05, + "loss": 0.2687, + "step": 459 + }, + { + "epoch": 0.9467455621301775, + "grad_norm": 0.19228306412696838, + "learning_rate": 1.168384879725086e-05, + "loss": 0.2633, + "step": 460 + }, + { + "epoch": 0.9488037046565475, + "grad_norm": 0.19025444984436035, + "learning_rate": 1.1660939289805271e-05, + "loss": 0.2721, + "step": 461 + }, + { + "epoch": 0.9508618471829174, + "grad_norm": 0.19476914405822754, + "learning_rate": 1.1638029782359682e-05, + "loss": 0.2662, + "step": 462 + }, + { + "epoch": 0.9529199897092874, + "grad_norm": 0.1991666853427887, + "learning_rate": 1.161512027491409e-05, + "loss": 0.269, + "step": 463 + }, + { + "epoch": 0.9549781322356573, + "grad_norm": 0.19385920464992523, + "learning_rate": 1.1592210767468501e-05, + "loss": 0.2647, + "step": 464 + }, + { + "epoch": 0.9570362747620272, + "grad_norm": 0.1911603957414627, + "learning_rate": 1.1569301260022912e-05, + "loss": 0.2679, + "step": 465 + }, + { + "epoch": 0.9590944172883972, + "grad_norm": 0.20373377203941345, + "learning_rate": 1.1546391752577319e-05, + "loss": 0.2694, + "step": 466 + }, + { + "epoch": 0.9611525598147672, + "grad_norm": 0.20550350844860077, + "learning_rate": 1.152348224513173e-05, + "loss": 0.2677, + "step": 467 + }, + { + "epoch": 0.9632107023411371, + "grad_norm": 0.2049354463815689, + "learning_rate": 1.150057273768614e-05, + "loss": 0.2752, + "step": 468 + }, + { + "epoch": 0.9652688448675071, + "grad_norm": 0.21691595017910004, + "learning_rate": 1.147766323024055e-05, + "loss": 0.2727, + "step": 469 + }, + { + "epoch": 0.967326987393877, + "grad_norm": 0.20727306604385376, + "learning_rate": 1.145475372279496e-05, + "loss": 0.2575, + "step": 470 + }, + { + "epoch": 0.969385129920247, + "grad_norm": 0.19166423380374908, + "learning_rate": 1.143184421534937e-05, + "loss": 0.2716, + "step": 471 + }, + { + "epoch": 0.9714432724466169, + "grad_norm": 0.18833886086940765, + "learning_rate": 1.140893470790378e-05, + "loss": 0.2651, + "step": 472 + }, + { + "epoch": 0.9735014149729869, + "grad_norm": 0.19680088758468628, + "learning_rate": 1.1386025200458191e-05, + "loss": 0.2621, + "step": 473 + }, + { + "epoch": 0.9755595574993569, + "grad_norm": 0.20966476202011108, + "learning_rate": 1.13631156930126e-05, + "loss": 0.2725, + "step": 474 + }, + { + "epoch": 0.9776177000257268, + "grad_norm": 0.1963450163602829, + "learning_rate": 1.134020618556701e-05, + "loss": 0.2569, + "step": 475 + }, + { + "epoch": 0.9796758425520967, + "grad_norm": 0.21289944648742676, + "learning_rate": 1.1317296678121421e-05, + "loss": 0.2622, + "step": 476 + }, + { + "epoch": 0.9817339850784667, + "grad_norm": 0.2103341966867447, + "learning_rate": 1.1294387170675832e-05, + "loss": 0.2803, + "step": 477 + }, + { + "epoch": 0.9837921276048366, + "grad_norm": 0.20202945172786713, + "learning_rate": 1.1271477663230241e-05, + "loss": 0.273, + "step": 478 + }, + { + "epoch": 0.9858502701312066, + "grad_norm": 0.18241006135940552, + "learning_rate": 1.1248568155784651e-05, + "loss": 0.2721, + "step": 479 + }, + { + "epoch": 0.9879084126575766, + "grad_norm": 0.19221259653568268, + "learning_rate": 1.1225658648339062e-05, + "loss": 0.2646, + "step": 480 + }, + { + "epoch": 0.9899665551839465, + "grad_norm": 0.19371837377548218, + "learning_rate": 1.1202749140893473e-05, + "loss": 0.2519, + "step": 481 + }, + { + "epoch": 0.9920246977103164, + "grad_norm": 0.1972094029188156, + "learning_rate": 1.1179839633447882e-05, + "loss": 0.2555, + "step": 482 + }, + { + "epoch": 0.9940828402366864, + "grad_norm": 0.19414126873016357, + "learning_rate": 1.1156930126002292e-05, + "loss": 0.2726, + "step": 483 + }, + { + "epoch": 0.9961409827630563, + "grad_norm": 0.18993492424488068, + "learning_rate": 1.1134020618556703e-05, + "loss": 0.2644, + "step": 484 + }, + { + "epoch": 0.9981991252894263, + "grad_norm": 0.19713927805423737, + "learning_rate": 1.1111111111111113e-05, + "loss": 0.2569, + "step": 485 + }, + { + "epoch": 1.00205814252637, + "grad_norm": 0.3423589766025543, + "learning_rate": 1.1088201603665522e-05, + "loss": 0.5285, + "step": 486 + }, + { + "epoch": 1.0041162850527399, + "grad_norm": 0.1901763528585434, + "learning_rate": 1.1065292096219933e-05, + "loss": 0.2621, + "step": 487 + }, + { + "epoch": 1.0061744275791098, + "grad_norm": 0.20508776605129242, + "learning_rate": 1.1042382588774343e-05, + "loss": 0.2665, + "step": 488 + }, + { + "epoch": 1.0082325701054797, + "grad_norm": 0.20188146829605103, + "learning_rate": 1.1019473081328752e-05, + "loss": 0.2547, + "step": 489 + }, + { + "epoch": 1.0102907126318497, + "grad_norm": 0.20245613157749176, + "learning_rate": 1.0996563573883163e-05, + "loss": 0.2657, + "step": 490 + }, + { + "epoch": 1.0123488551582196, + "grad_norm": 0.19711382687091827, + "learning_rate": 1.0973654066437574e-05, + "loss": 0.2597, + "step": 491 + }, + { + "epoch": 1.0144069976845898, + "grad_norm": 0.21538953483104706, + "learning_rate": 1.0950744558991984e-05, + "loss": 0.2727, + "step": 492 + }, + { + "epoch": 1.0164651402109597, + "grad_norm": 0.20296984910964966, + "learning_rate": 1.0927835051546391e-05, + "loss": 0.2634, + "step": 493 + }, + { + "epoch": 1.0185232827373296, + "grad_norm": 0.20134592056274414, + "learning_rate": 1.0904925544100802e-05, + "loss": 0.2596, + "step": 494 + }, + { + "epoch": 1.0205814252636995, + "grad_norm": 0.200101837515831, + "learning_rate": 1.0882016036655212e-05, + "loss": 0.2575, + "step": 495 + }, + { + "epoch": 1.0226395677900695, + "grad_norm": 0.19144928455352783, + "learning_rate": 1.0859106529209621e-05, + "loss": 0.263, + "step": 496 + }, + { + "epoch": 1.0246977103164394, + "grad_norm": 0.19832482933998108, + "learning_rate": 1.0836197021764032e-05, + "loss": 0.2656, + "step": 497 + }, + { + "epoch": 1.0267558528428093, + "grad_norm": 0.20965202152729034, + "learning_rate": 1.0813287514318443e-05, + "loss": 0.2611, + "step": 498 + }, + { + "epoch": 1.0288139953691793, + "grad_norm": 0.1974337100982666, + "learning_rate": 1.0790378006872853e-05, + "loss": 0.2667, + "step": 499 + }, + { + "epoch": 1.0308721378955492, + "grad_norm": 0.20611713826656342, + "learning_rate": 1.0767468499427262e-05, + "loss": 0.2674, + "step": 500 + }, + { + "epoch": 1.0308721378955492, + "eval_loss": 0.2836935222148895, + "eval_runtime": 2423.44, + "eval_samples_per_second": 3.208, + "eval_steps_per_second": 0.802, + "step": 500 + }, + { + "epoch": 1.0329302804219191, + "grad_norm": 0.202958345413208, + "learning_rate": 1.0744558991981673e-05, + "loss": 0.2684, + "step": 501 + }, + { + "epoch": 1.034988422948289, + "grad_norm": 0.1984429508447647, + "learning_rate": 1.0721649484536083e-05, + "loss": 0.2557, + "step": 502 + }, + { + "epoch": 1.0370465654746592, + "grad_norm": 0.19396482408046722, + "learning_rate": 1.0698739977090494e-05, + "loss": 0.255, + "step": 503 + }, + { + "epoch": 1.0391047080010292, + "grad_norm": 0.19176840782165527, + "learning_rate": 1.0675830469644903e-05, + "loss": 0.2675, + "step": 504 + }, + { + "epoch": 1.041162850527399, + "grad_norm": 0.20167966187000275, + "learning_rate": 1.0652920962199313e-05, + "loss": 0.2669, + "step": 505 + }, + { + "epoch": 1.043220993053769, + "grad_norm": 0.2049783617258072, + "learning_rate": 1.0630011454753724e-05, + "loss": 0.2446, + "step": 506 + }, + { + "epoch": 1.045279135580139, + "grad_norm": 0.19293472170829773, + "learning_rate": 1.0607101947308135e-05, + "loss": 0.256, + "step": 507 + }, + { + "epoch": 1.047337278106509, + "grad_norm": 0.19432370364665985, + "learning_rate": 1.0584192439862543e-05, + "loss": 0.2605, + "step": 508 + }, + { + "epoch": 1.0493954206328788, + "grad_norm": 0.19784876704216003, + "learning_rate": 1.0561282932416954e-05, + "loss": 0.2617, + "step": 509 + }, + { + "epoch": 1.0514535631592488, + "grad_norm": 0.19982090592384338, + "learning_rate": 1.0538373424971365e-05, + "loss": 0.264, + "step": 510 + }, + { + "epoch": 1.0535117056856187, + "grad_norm": 0.2019587755203247, + "learning_rate": 1.0515463917525775e-05, + "loss": 0.2543, + "step": 511 + }, + { + "epoch": 1.0555698482119886, + "grad_norm": 0.19848807156085968, + "learning_rate": 1.0492554410080184e-05, + "loss": 0.2613, + "step": 512 + }, + { + "epoch": 1.0576279907383586, + "grad_norm": 0.20360374450683594, + "learning_rate": 1.0469644902634595e-05, + "loss": 0.2675, + "step": 513 + }, + { + "epoch": 1.0596861332647285, + "grad_norm": 0.19209840893745422, + "learning_rate": 1.0446735395189005e-05, + "loss": 0.2517, + "step": 514 + }, + { + "epoch": 1.0617442757910984, + "grad_norm": 0.19142381846904755, + "learning_rate": 1.0423825887743416e-05, + "loss": 0.2631, + "step": 515 + }, + { + "epoch": 1.0638024183174686, + "grad_norm": 0.20222575962543488, + "learning_rate": 1.0400916380297825e-05, + "loss": 0.2625, + "step": 516 + }, + { + "epoch": 1.0658605608438385, + "grad_norm": 0.1984448879957199, + "learning_rate": 1.0378006872852235e-05, + "loss": 0.2584, + "step": 517 + }, + { + "epoch": 1.0679187033702084, + "grad_norm": 0.1992885023355484, + "learning_rate": 1.0355097365406646e-05, + "loss": 0.2609, + "step": 518 + }, + { + "epoch": 1.0699768458965784, + "grad_norm": 0.20708978176116943, + "learning_rate": 1.0332187857961057e-05, + "loss": 0.2618, + "step": 519 + }, + { + "epoch": 1.0720349884229483, + "grad_norm": 0.22806766629219055, + "learning_rate": 1.0309278350515464e-05, + "loss": 0.2634, + "step": 520 + }, + { + "epoch": 1.0740931309493182, + "grad_norm": 0.2019941806793213, + "learning_rate": 1.0286368843069874e-05, + "loss": 0.2588, + "step": 521 + }, + { + "epoch": 1.0761512734756882, + "grad_norm": 0.19460470974445343, + "learning_rate": 1.0263459335624283e-05, + "loss": 0.2692, + "step": 522 + }, + { + "epoch": 1.078209416002058, + "grad_norm": 0.19483187794685364, + "learning_rate": 1.0240549828178694e-05, + "loss": 0.2474, + "step": 523 + }, + { + "epoch": 1.080267558528428, + "grad_norm": 0.2199576050043106, + "learning_rate": 1.0217640320733104e-05, + "loss": 0.2582, + "step": 524 + }, + { + "epoch": 1.082325701054798, + "grad_norm": 0.20485302805900574, + "learning_rate": 1.0194730813287515e-05, + "loss": 0.2463, + "step": 525 + }, + { + "epoch": 1.084383843581168, + "grad_norm": 0.20773454010486603, + "learning_rate": 1.0171821305841924e-05, + "loss": 0.2501, + "step": 526 + }, + { + "epoch": 1.086441986107538, + "grad_norm": 0.19593262672424316, + "learning_rate": 1.0148911798396335e-05, + "loss": 0.2608, + "step": 527 + }, + { + "epoch": 1.088500128633908, + "grad_norm": 0.20500554144382477, + "learning_rate": 1.0126002290950745e-05, + "loss": 0.2586, + "step": 528 + }, + { + "epoch": 1.090558271160278, + "grad_norm": 0.19919747114181519, + "learning_rate": 1.0103092783505156e-05, + "loss": 0.2724, + "step": 529 + }, + { + "epoch": 1.0926164136866479, + "grad_norm": 0.1953326314687729, + "learning_rate": 1.0080183276059565e-05, + "loss": 0.2456, + "step": 530 + }, + { + "epoch": 1.0946745562130178, + "grad_norm": 0.2155047059059143, + "learning_rate": 1.0057273768613975e-05, + "loss": 0.2644, + "step": 531 + }, + { + "epoch": 1.0967326987393877, + "grad_norm": 0.19747495651245117, + "learning_rate": 1.0034364261168386e-05, + "loss": 0.2539, + "step": 532 + }, + { + "epoch": 1.0987908412657577, + "grad_norm": 0.20261652767658234, + "learning_rate": 1.0011454753722796e-05, + "loss": 0.255, + "step": 533 + }, + { + "epoch": 1.1008489837921276, + "grad_norm": 0.19529719650745392, + "learning_rate": 9.988545246277205e-06, + "loss": 0.2489, + "step": 534 + }, + { + "epoch": 1.1029071263184975, + "grad_norm": 0.20239490270614624, + "learning_rate": 9.965635738831616e-06, + "loss": 0.2664, + "step": 535 + }, + { + "epoch": 1.1049652688448675, + "grad_norm": 0.19377024471759796, + "learning_rate": 9.942726231386026e-06, + "loss": 0.2615, + "step": 536 + }, + { + "epoch": 1.1070234113712374, + "grad_norm": 0.20523156225681305, + "learning_rate": 9.919816723940437e-06, + "loss": 0.2548, + "step": 537 + }, + { + "epoch": 1.1090815538976073, + "grad_norm": 0.2046228051185608, + "learning_rate": 9.896907216494846e-06, + "loss": 0.2704, + "step": 538 + }, + { + "epoch": 1.1111396964239773, + "grad_norm": 0.21209484338760376, + "learning_rate": 9.873997709049257e-06, + "loss": 0.2637, + "step": 539 + }, + { + "epoch": 1.1131978389503474, + "grad_norm": 0.20251420140266418, + "learning_rate": 9.851088201603667e-06, + "loss": 0.2617, + "step": 540 + }, + { + "epoch": 1.1152559814767173, + "grad_norm": 0.21695846319198608, + "learning_rate": 9.828178694158076e-06, + "loss": 0.2658, + "step": 541 + }, + { + "epoch": 1.1173141240030873, + "grad_norm": 0.2015303075313568, + "learning_rate": 9.805269186712487e-06, + "loss": 0.2528, + "step": 542 + }, + { + "epoch": 1.1193722665294572, + "grad_norm": 0.21796390414237976, + "learning_rate": 9.782359679266896e-06, + "loss": 0.2625, + "step": 543 + }, + { + "epoch": 1.1214304090558271, + "grad_norm": 0.20676304399967194, + "learning_rate": 9.759450171821306e-06, + "loss": 0.268, + "step": 544 + }, + { + "epoch": 1.123488551582197, + "grad_norm": 0.1986500769853592, + "learning_rate": 9.736540664375717e-06, + "loss": 0.2546, + "step": 545 + }, + { + "epoch": 1.125546694108567, + "grad_norm": 0.20008589327335358, + "learning_rate": 9.713631156930127e-06, + "loss": 0.2525, + "step": 546 + }, + { + "epoch": 1.127604836634937, + "grad_norm": 0.1891598105430603, + "learning_rate": 9.690721649484536e-06, + "loss": 0.256, + "step": 547 + }, + { + "epoch": 1.1296629791613069, + "grad_norm": 0.20968230068683624, + "learning_rate": 9.667812142038947e-06, + "loss": 0.2495, + "step": 548 + }, + { + "epoch": 1.1317211216876768, + "grad_norm": 0.2025834023952484, + "learning_rate": 9.644902634593357e-06, + "loss": 0.2533, + "step": 549 + }, + { + "epoch": 1.1337792642140467, + "grad_norm": 0.21087367832660675, + "learning_rate": 9.621993127147768e-06, + "loss": 0.2518, + "step": 550 + }, + { + "epoch": 1.1358374067404169, + "grad_norm": 0.20784996449947357, + "learning_rate": 9.599083619702177e-06, + "loss": 0.2594, + "step": 551 + }, + { + "epoch": 1.1378955492667868, + "grad_norm": 0.20754118263721466, + "learning_rate": 9.576174112256587e-06, + "loss": 0.2515, + "step": 552 + }, + { + "epoch": 1.1399536917931568, + "grad_norm": 0.225090891122818, + "learning_rate": 9.553264604810998e-06, + "loss": 0.2615, + "step": 553 + }, + { + "epoch": 1.1420118343195267, + "grad_norm": 0.24656590819358826, + "learning_rate": 9.530355097365407e-06, + "loss": 0.2636, + "step": 554 + }, + { + "epoch": 1.1440699768458966, + "grad_norm": 0.22454337775707245, + "learning_rate": 9.507445589919818e-06, + "loss": 0.2584, + "step": 555 + }, + { + "epoch": 1.1461281193722666, + "grad_norm": 0.2229425013065338, + "learning_rate": 9.484536082474226e-06, + "loss": 0.2543, + "step": 556 + }, + { + "epoch": 1.1481862618986365, + "grad_norm": 0.18805071711540222, + "learning_rate": 9.461626575028637e-06, + "loss": 0.2593, + "step": 557 + }, + { + "epoch": 1.1502444044250064, + "grad_norm": 0.23163346946239471, + "learning_rate": 9.438717067583048e-06, + "loss": 0.2537, + "step": 558 + }, + { + "epoch": 1.1523025469513763, + "grad_norm": 0.2126983255147934, + "learning_rate": 9.415807560137458e-06, + "loss": 0.2598, + "step": 559 + }, + { + "epoch": 1.1543606894777463, + "grad_norm": 0.2113332748413086, + "learning_rate": 9.392898052691867e-06, + "loss": 0.2617, + "step": 560 + }, + { + "epoch": 1.1564188320041162, + "grad_norm": 0.2220505177974701, + "learning_rate": 9.369988545246278e-06, + "loss": 0.2673, + "step": 561 + }, + { + "epoch": 1.1584769745304861, + "grad_norm": 0.21683354675769806, + "learning_rate": 9.347079037800688e-06, + "loss": 0.259, + "step": 562 + }, + { + "epoch": 1.160535117056856, + "grad_norm": 0.20226940512657166, + "learning_rate": 9.324169530355099e-06, + "loss": 0.2536, + "step": 563 + }, + { + "epoch": 1.1625932595832262, + "grad_norm": 0.2166106402873993, + "learning_rate": 9.301260022909508e-06, + "loss": 0.2573, + "step": 564 + }, + { + "epoch": 1.1646514021095962, + "grad_norm": 0.21802830696105957, + "learning_rate": 9.278350515463918e-06, + "loss": 0.2604, + "step": 565 + }, + { + "epoch": 1.166709544635966, + "grad_norm": 0.19723279774188995, + "learning_rate": 9.255441008018329e-06, + "loss": 0.2643, + "step": 566 + }, + { + "epoch": 1.168767687162336, + "grad_norm": 0.20100893080234528, + "learning_rate": 9.23253150057274e-06, + "loss": 0.2601, + "step": 567 + }, + { + "epoch": 1.170825829688706, + "grad_norm": 0.19834032654762268, + "learning_rate": 9.209621993127148e-06, + "loss": 0.2624, + "step": 568 + }, + { + "epoch": 1.172883972215076, + "grad_norm": 0.20677493512630463, + "learning_rate": 9.186712485681557e-06, + "loss": 0.2527, + "step": 569 + }, + { + "epoch": 1.1749421147414458, + "grad_norm": 0.20895297825336456, + "learning_rate": 9.163802978235968e-06, + "loss": 0.2519, + "step": 570 + }, + { + "epoch": 1.1770002572678158, + "grad_norm": 0.19748030602931976, + "learning_rate": 9.140893470790379e-06, + "loss": 0.2567, + "step": 571 + }, + { + "epoch": 1.1790583997941857, + "grad_norm": 0.20713521540164948, + "learning_rate": 9.117983963344789e-06, + "loss": 0.2771, + "step": 572 + }, + { + "epoch": 1.1811165423205556, + "grad_norm": 0.2146754264831543, + "learning_rate": 9.095074455899198e-06, + "loss": 0.2537, + "step": 573 + }, + { + "epoch": 1.1831746848469256, + "grad_norm": 0.20723004639148712, + "learning_rate": 9.072164948453609e-06, + "loss": 0.253, + "step": 574 + }, + { + "epoch": 1.1852328273732957, + "grad_norm": 0.2072172611951828, + "learning_rate": 9.04925544100802e-06, + "loss": 0.2545, + "step": 575 + }, + { + "epoch": 1.1872909698996654, + "grad_norm": 0.20537281036376953, + "learning_rate": 9.02634593356243e-06, + "loss": 0.2517, + "step": 576 + }, + { + "epoch": 1.1893491124260356, + "grad_norm": 0.21034401655197144, + "learning_rate": 9.003436426116839e-06, + "loss": 0.2506, + "step": 577 + }, + { + "epoch": 1.1914072549524055, + "grad_norm": 0.21373845636844635, + "learning_rate": 8.98052691867125e-06, + "loss": 0.2544, + "step": 578 + }, + { + "epoch": 1.1934653974787754, + "grad_norm": 0.22282572090625763, + "learning_rate": 8.95761741122566e-06, + "loss": 0.2607, + "step": 579 + }, + { + "epoch": 1.1955235400051454, + "grad_norm": 0.20421402156352997, + "learning_rate": 8.93470790378007e-06, + "loss": 0.2636, + "step": 580 + }, + { + "epoch": 1.1975816825315153, + "grad_norm": 0.2095903605222702, + "learning_rate": 8.91179839633448e-06, + "loss": 0.2627, + "step": 581 + }, + { + "epoch": 1.1996398250578852, + "grad_norm": 0.2215132862329483, + "learning_rate": 8.888888888888888e-06, + "loss": 0.2651, + "step": 582 + }, + { + "epoch": 1.2016979675842552, + "grad_norm": 0.22536343336105347, + "learning_rate": 8.865979381443299e-06, + "loss": 0.2548, + "step": 583 + }, + { + "epoch": 1.203756110110625, + "grad_norm": 0.19969668984413147, + "learning_rate": 8.84306987399771e-06, + "loss": 0.2646, + "step": 584 + }, + { + "epoch": 1.205814252636995, + "grad_norm": 0.225993350148201, + "learning_rate": 8.82016036655212e-06, + "loss": 0.2607, + "step": 585 + }, + { + "epoch": 1.207872395163365, + "grad_norm": 0.19197311997413635, + "learning_rate": 8.797250859106529e-06, + "loss": 0.2519, + "step": 586 + }, + { + "epoch": 1.209930537689735, + "grad_norm": 0.1974429190158844, + "learning_rate": 8.77434135166094e-06, + "loss": 0.2512, + "step": 587 + }, + { + "epoch": 1.211988680216105, + "grad_norm": 0.19816122949123383, + "learning_rate": 8.75143184421535e-06, + "loss": 0.2582, + "step": 588 + }, + { + "epoch": 1.214046822742475, + "grad_norm": 0.20259711146354675, + "learning_rate": 8.72852233676976e-06, + "loss": 0.2561, + "step": 589 + }, + { + "epoch": 1.216104965268845, + "grad_norm": 0.23857274651527405, + "learning_rate": 8.70561282932417e-06, + "loss": 0.2574, + "step": 590 + }, + { + "epoch": 1.2181631077952149, + "grad_norm": 0.2108597606420517, + "learning_rate": 8.68270332187858e-06, + "loss": 0.2546, + "step": 591 + }, + { + "epoch": 1.2202212503215848, + "grad_norm": 0.20933857560157776, + "learning_rate": 8.65979381443299e-06, + "loss": 0.2527, + "step": 592 + }, + { + "epoch": 1.2222793928479547, + "grad_norm": 0.19276075065135956, + "learning_rate": 8.636884306987401e-06, + "loss": 0.26, + "step": 593 + }, + { + "epoch": 1.2243375353743247, + "grad_norm": 0.2111658900976181, + "learning_rate": 8.61397479954181e-06, + "loss": 0.267, + "step": 594 + }, + { + "epoch": 1.2263956779006946, + "grad_norm": 0.20039953291416168, + "learning_rate": 8.591065292096221e-06, + "loss": 0.2454, + "step": 595 + }, + { + "epoch": 1.2284538204270645, + "grad_norm": 0.212934210896492, + "learning_rate": 8.56815578465063e-06, + "loss": 0.2674, + "step": 596 + }, + { + "epoch": 1.2305119629534345, + "grad_norm": 0.2036072462797165, + "learning_rate": 8.54524627720504e-06, + "loss": 0.2613, + "step": 597 + }, + { + "epoch": 1.2325701054798044, + "grad_norm": 0.20735019445419312, + "learning_rate": 8.522336769759451e-06, + "loss": 0.2648, + "step": 598 + }, + { + "epoch": 1.2346282480061745, + "grad_norm": 0.2097824215888977, + "learning_rate": 8.49942726231386e-06, + "loss": 0.2535, + "step": 599 + }, + { + "epoch": 1.2366863905325443, + "grad_norm": 0.19988034665584564, + "learning_rate": 8.47651775486827e-06, + "loss": 0.2507, + "step": 600 + }, + { + "epoch": 1.2366863905325443, + "eval_loss": 0.28046268224716187, + "eval_runtime": 2441.2385, + "eval_samples_per_second": 3.184, + "eval_steps_per_second": 0.796, + "step": 600 + }, + { + "epoch": 1.2387445330589144, + "grad_norm": 0.20321473479270935, + "learning_rate": 8.453608247422681e-06, + "loss": 0.2588, + "step": 601 + }, + { + "epoch": 1.2408026755852843, + "grad_norm": 0.20362116396427155, + "learning_rate": 8.430698739977092e-06, + "loss": 0.2608, + "step": 602 + }, + { + "epoch": 1.2428608181116543, + "grad_norm": 0.20123381912708282, + "learning_rate": 8.4077892325315e-06, + "loss": 0.2527, + "step": 603 + }, + { + "epoch": 1.2449189606380242, + "grad_norm": 0.2133895605802536, + "learning_rate": 8.384879725085911e-06, + "loss": 0.2731, + "step": 604 + }, + { + "epoch": 1.2469771031643941, + "grad_norm": 0.5265193581581116, + "learning_rate": 8.361970217640322e-06, + "loss": 0.2498, + "step": 605 + }, + { + "epoch": 1.249035245690764, + "grad_norm": 0.2142847776412964, + "learning_rate": 8.339060710194732e-06, + "loss": 0.268, + "step": 606 + }, + { + "epoch": 1.251093388217134, + "grad_norm": 0.19556185603141785, + "learning_rate": 8.316151202749141e-06, + "loss": 0.2587, + "step": 607 + }, + { + "epoch": 1.253151530743504, + "grad_norm": 0.20104384422302246, + "learning_rate": 8.293241695303552e-06, + "loss": 0.248, + "step": 608 + }, + { + "epoch": 1.2552096732698739, + "grad_norm": 0.20386339724063873, + "learning_rate": 8.27033218785796e-06, + "loss": 0.2564, + "step": 609 + }, + { + "epoch": 1.257267815796244, + "grad_norm": 0.21464361250400543, + "learning_rate": 8.247422680412371e-06, + "loss": 0.2651, + "step": 610 + }, + { + "epoch": 1.2593259583226137, + "grad_norm": 0.20295380055904388, + "learning_rate": 8.224513172966782e-06, + "loss": 0.249, + "step": 611 + }, + { + "epoch": 1.261384100848984, + "grad_norm": 0.19431617856025696, + "learning_rate": 8.201603665521193e-06, + "loss": 0.2487, + "step": 612 + }, + { + "epoch": 1.2634422433753538, + "grad_norm": 0.20218072831630707, + "learning_rate": 8.178694158075601e-06, + "loss": 0.2609, + "step": 613 + }, + { + "epoch": 1.2655003859017238, + "grad_norm": 0.20500090718269348, + "learning_rate": 8.155784650630012e-06, + "loss": 0.2705, + "step": 614 + }, + { + "epoch": 1.2675585284280937, + "grad_norm": 0.20803052186965942, + "learning_rate": 8.132875143184423e-06, + "loss": 0.2525, + "step": 615 + }, + { + "epoch": 1.2696166709544636, + "grad_norm": 0.2087874561548233, + "learning_rate": 8.109965635738832e-06, + "loss": 0.2541, + "step": 616 + }, + { + "epoch": 1.2716748134808336, + "grad_norm": 0.2055324912071228, + "learning_rate": 8.087056128293242e-06, + "loss": 0.2647, + "step": 617 + }, + { + "epoch": 1.2737329560072035, + "grad_norm": 0.20352068543434143, + "learning_rate": 8.064146620847653e-06, + "loss": 0.2666, + "step": 618 + }, + { + "epoch": 1.2757910985335734, + "grad_norm": 0.20651914179325104, + "learning_rate": 8.041237113402063e-06, + "loss": 0.2525, + "step": 619 + }, + { + "epoch": 1.2778492410599434, + "grad_norm": 0.2097817212343216, + "learning_rate": 8.018327605956472e-06, + "loss": 0.2576, + "step": 620 + }, + { + "epoch": 1.2799073835863133, + "grad_norm": 0.20695503056049347, + "learning_rate": 7.995418098510883e-06, + "loss": 0.2633, + "step": 621 + }, + { + "epoch": 1.2819655261126832, + "grad_norm": 0.20550110936164856, + "learning_rate": 7.972508591065293e-06, + "loss": 0.2629, + "step": 622 + }, + { + "epoch": 1.2840236686390534, + "grad_norm": 0.2035083919763565, + "learning_rate": 7.949599083619702e-06, + "loss": 0.2566, + "step": 623 + }, + { + "epoch": 1.286081811165423, + "grad_norm": 0.21426044404506683, + "learning_rate": 7.926689576174113e-06, + "loss": 0.2636, + "step": 624 + }, + { + "epoch": 1.2881399536917932, + "grad_norm": 0.20519520342350006, + "learning_rate": 7.903780068728523e-06, + "loss": 0.2665, + "step": 625 + }, + { + "epoch": 1.2901980962181632, + "grad_norm": 0.2012549638748169, + "learning_rate": 7.880870561282932e-06, + "loss": 0.2588, + "step": 626 + }, + { + "epoch": 1.292256238744533, + "grad_norm": 0.19951675832271576, + "learning_rate": 7.857961053837343e-06, + "loss": 0.2592, + "step": 627 + }, + { + "epoch": 1.294314381270903, + "grad_norm": 0.21163856983184814, + "learning_rate": 7.835051546391754e-06, + "loss": 0.26, + "step": 628 + }, + { + "epoch": 1.296372523797273, + "grad_norm": 0.21543577313423157, + "learning_rate": 7.812142038946164e-06, + "loss": 0.2486, + "step": 629 + }, + { + "epoch": 1.298430666323643, + "grad_norm": 0.20984649658203125, + "learning_rate": 7.789232531500573e-06, + "loss": 0.2603, + "step": 630 + }, + { + "epoch": 1.3004888088500128, + "grad_norm": 0.20047229528427124, + "learning_rate": 7.766323024054984e-06, + "loss": 0.2559, + "step": 631 + }, + { + "epoch": 1.3025469513763828, + "grad_norm": 0.21747010946273804, + "learning_rate": 7.743413516609394e-06, + "loss": 0.2563, + "step": 632 + }, + { + "epoch": 1.3046050939027527, + "grad_norm": 0.20818108320236206, + "learning_rate": 7.720504009163803e-06, + "loss": 0.2507, + "step": 633 + }, + { + "epoch": 1.3066632364291229, + "grad_norm": 0.19827309250831604, + "learning_rate": 7.697594501718214e-06, + "loss": 0.2578, + "step": 634 + }, + { + "epoch": 1.3087213789554926, + "grad_norm": 0.2122543305158615, + "learning_rate": 7.674684994272624e-06, + "loss": 0.2633, + "step": 635 + }, + { + "epoch": 1.3107795214818627, + "grad_norm": 0.20870576798915863, + "learning_rate": 7.651775486827033e-06, + "loss": 0.2616, + "step": 636 + }, + { + "epoch": 1.3128376640082327, + "grad_norm": 0.2069362998008728, + "learning_rate": 7.628865979381444e-06, + "loss": 0.2426, + "step": 637 + }, + { + "epoch": 1.3148958065346026, + "grad_norm": 0.19999894499778748, + "learning_rate": 7.6059564719358535e-06, + "loss": 0.2547, + "step": 638 + }, + { + "epoch": 1.3169539490609725, + "grad_norm": 0.20518334209918976, + "learning_rate": 7.583046964490264e-06, + "loss": 0.2571, + "step": 639 + }, + { + "epoch": 1.3190120915873425, + "grad_norm": 0.20558986067771912, + "learning_rate": 7.560137457044674e-06, + "loss": 0.2483, + "step": 640 + }, + { + "epoch": 1.3210702341137124, + "grad_norm": 0.21443884074687958, + "learning_rate": 7.5372279495990845e-06, + "loss": 0.2494, + "step": 641 + }, + { + "epoch": 1.3231283766400823, + "grad_norm": 0.2025483101606369, + "learning_rate": 7.514318442153494e-06, + "loss": 0.2473, + "step": 642 + }, + { + "epoch": 1.3251865191664522, + "grad_norm": 0.21094976365566254, + "learning_rate": 7.491408934707905e-06, + "loss": 0.2603, + "step": 643 + }, + { + "epoch": 1.3272446616928222, + "grad_norm": 0.2047881782054901, + "learning_rate": 7.4684994272623145e-06, + "loss": 0.2601, + "step": 644 + }, + { + "epoch": 1.3293028042191921, + "grad_norm": 0.2075866013765335, + "learning_rate": 7.445589919816725e-06, + "loss": 0.2644, + "step": 645 + }, + { + "epoch": 1.331360946745562, + "grad_norm": 0.2174414098262787, + "learning_rate": 7.422680412371135e-06, + "loss": 0.2609, + "step": 646 + }, + { + "epoch": 1.3334190892719322, + "grad_norm": 0.20820266008377075, + "learning_rate": 7.3997709049255455e-06, + "loss": 0.2535, + "step": 647 + }, + { + "epoch": 1.335477231798302, + "grad_norm": 0.20941515266895294, + "learning_rate": 7.376861397479955e-06, + "loss": 0.2578, + "step": 648 + }, + { + "epoch": 1.337535374324672, + "grad_norm": 0.2027975171804428, + "learning_rate": 7.353951890034365e-06, + "loss": 0.2573, + "step": 649 + }, + { + "epoch": 1.339593516851042, + "grad_norm": 0.209550142288208, + "learning_rate": 7.331042382588775e-06, + "loss": 0.2513, + "step": 650 + }, + { + "epoch": 1.341651659377412, + "grad_norm": 0.21425557136535645, + "learning_rate": 7.3081328751431845e-06, + "loss": 0.2568, + "step": 651 + }, + { + "epoch": 1.3437098019037819, + "grad_norm": 0.22760476171970367, + "learning_rate": 7.285223367697595e-06, + "loss": 0.2549, + "step": 652 + }, + { + "epoch": 1.3457679444301518, + "grad_norm": 0.21329441666603088, + "learning_rate": 7.262313860252005e-06, + "loss": 0.2467, + "step": 653 + }, + { + "epoch": 1.3478260869565217, + "grad_norm": 0.20949490368366241, + "learning_rate": 7.239404352806415e-06, + "loss": 0.2569, + "step": 654 + }, + { + "epoch": 1.3498842294828917, + "grad_norm": 0.21022753417491913, + "learning_rate": 7.216494845360825e-06, + "loss": 0.2644, + "step": 655 + }, + { + "epoch": 1.3519423720092616, + "grad_norm": 0.20240676403045654, + "learning_rate": 7.193585337915236e-06, + "loss": 0.2561, + "step": 656 + }, + { + "epoch": 1.3540005145356315, + "grad_norm": 0.19892892241477966, + "learning_rate": 7.1706758304696455e-06, + "loss": 0.2564, + "step": 657 + }, + { + "epoch": 1.3560586570620017, + "grad_norm": 0.22104541957378387, + "learning_rate": 7.147766323024056e-06, + "loss": 0.2466, + "step": 658 + }, + { + "epoch": 1.3581167995883714, + "grad_norm": 0.2074560970067978, + "learning_rate": 7.124856815578466e-06, + "loss": 0.2634, + "step": 659 + }, + { + "epoch": 1.3601749421147415, + "grad_norm": 0.20596396923065186, + "learning_rate": 7.101947308132876e-06, + "loss": 0.2566, + "step": 660 + }, + { + "epoch": 1.3622330846411115, + "grad_norm": 0.2072969526052475, + "learning_rate": 7.079037800687286e-06, + "loss": 0.2603, + "step": 661 + }, + { + "epoch": 1.3642912271674814, + "grad_norm": 0.21680790185928345, + "learning_rate": 7.056128293241697e-06, + "loss": 0.2536, + "step": 662 + }, + { + "epoch": 1.3663493696938513, + "grad_norm": 0.2035921961069107, + "learning_rate": 7.0332187857961065e-06, + "loss": 0.2567, + "step": 663 + }, + { + "epoch": 1.3684075122202213, + "grad_norm": 0.21186605095863342, + "learning_rate": 7.010309278350515e-06, + "loss": 0.2575, + "step": 664 + }, + { + "epoch": 1.3704656547465912, + "grad_norm": 0.21388404071331024, + "learning_rate": 6.987399770904926e-06, + "loss": 0.2522, + "step": 665 + }, + { + "epoch": 1.3725237972729611, + "grad_norm": 0.21118783950805664, + "learning_rate": 6.964490263459336e-06, + "loss": 0.25, + "step": 666 + }, + { + "epoch": 1.374581939799331, + "grad_norm": 0.21162322163581848, + "learning_rate": 6.941580756013746e-06, + "loss": 0.253, + "step": 667 + }, + { + "epoch": 1.376640082325701, + "grad_norm": 0.21186329424381256, + "learning_rate": 6.918671248568156e-06, + "loss": 0.2589, + "step": 668 + }, + { + "epoch": 1.378698224852071, + "grad_norm": 0.21206888556480408, + "learning_rate": 6.895761741122567e-06, + "loss": 0.2629, + "step": 669 + }, + { + "epoch": 1.3807563673784409, + "grad_norm": 0.21045179665088654, + "learning_rate": 6.872852233676976e-06, + "loss": 0.2523, + "step": 670 + }, + { + "epoch": 1.382814509904811, + "grad_norm": 0.21106329560279846, + "learning_rate": 6.849942726231387e-06, + "loss": 0.2611, + "step": 671 + }, + { + "epoch": 1.3848726524311807, + "grad_norm": 0.20593757927417755, + "learning_rate": 6.827033218785797e-06, + "loss": 0.2537, + "step": 672 + }, + { + "epoch": 1.386930794957551, + "grad_norm": 0.2040368914604187, + "learning_rate": 6.804123711340207e-06, + "loss": 0.2545, + "step": 673 + }, + { + "epoch": 1.3889889374839208, + "grad_norm": 0.2148980051279068, + "learning_rate": 6.781214203894617e-06, + "loss": 0.264, + "step": 674 + }, + { + "epoch": 1.3910470800102908, + "grad_norm": 0.204456627368927, + "learning_rate": 6.758304696449028e-06, + "loss": 0.2609, + "step": 675 + }, + { + "epoch": 1.3931052225366607, + "grad_norm": 0.20230846107006073, + "learning_rate": 6.735395189003437e-06, + "loss": 0.2644, + "step": 676 + }, + { + "epoch": 1.3951633650630306, + "grad_norm": 0.205158531665802, + "learning_rate": 6.712485681557846e-06, + "loss": 0.2611, + "step": 677 + }, + { + "epoch": 1.3972215075894006, + "grad_norm": 0.21487553417682648, + "learning_rate": 6.689576174112257e-06, + "loss": 0.2492, + "step": 678 + }, + { + "epoch": 1.3992796501157705, + "grad_norm": 0.21277402341365814, + "learning_rate": 6.666666666666667e-06, + "loss": 0.2491, + "step": 679 + }, + { + "epoch": 1.4013377926421404, + "grad_norm": 0.2049219310283661, + "learning_rate": 6.643757159221077e-06, + "loss": 0.2444, + "step": 680 + }, + { + "epoch": 1.4033959351685104, + "grad_norm": 0.23122920095920563, + "learning_rate": 6.620847651775487e-06, + "loss": 0.2522, + "step": 681 + }, + { + "epoch": 1.4054540776948803, + "grad_norm": 0.2067662477493286, + "learning_rate": 6.597938144329898e-06, + "loss": 0.2583, + "step": 682 + }, + { + "epoch": 1.4075122202212502, + "grad_norm": 0.2043958306312561, + "learning_rate": 6.575028636884307e-06, + "loss": 0.2603, + "step": 683 + }, + { + "epoch": 1.4095703627476204, + "grad_norm": 0.21982067823410034, + "learning_rate": 6.552119129438718e-06, + "loss": 0.246, + "step": 684 + }, + { + "epoch": 1.41162850527399, + "grad_norm": 0.21510522067546844, + "learning_rate": 6.529209621993128e-06, + "loss": 0.2554, + "step": 685 + }, + { + "epoch": 1.4136866478003602, + "grad_norm": 0.24448052048683167, + "learning_rate": 6.506300114547538e-06, + "loss": 0.256, + "step": 686 + }, + { + "epoch": 1.4157447903267302, + "grad_norm": 0.2068399339914322, + "learning_rate": 6.483390607101948e-06, + "loss": 0.2566, + "step": 687 + }, + { + "epoch": 1.4178029328531, + "grad_norm": 0.20870736241340637, + "learning_rate": 6.460481099656359e-06, + "loss": 0.2493, + "step": 688 + }, + { + "epoch": 1.41986107537947, + "grad_norm": 0.22065278887748718, + "learning_rate": 6.437571592210768e-06, + "loss": 0.2566, + "step": 689 + }, + { + "epoch": 1.42191921790584, + "grad_norm": 0.21523869037628174, + "learning_rate": 6.414662084765179e-06, + "loss": 0.2579, + "step": 690 + }, + { + "epoch": 1.42397736043221, + "grad_norm": 0.21578392386436462, + "learning_rate": 6.391752577319588e-06, + "loss": 0.2555, + "step": 691 + }, + { + "epoch": 1.4260355029585798, + "grad_norm": 0.2096480280160904, + "learning_rate": 6.3688430698739976e-06, + "loss": 0.2534, + "step": 692 + }, + { + "epoch": 1.4280936454849498, + "grad_norm": 0.21274186670780182, + "learning_rate": 6.345933562428408e-06, + "loss": 0.2521, + "step": 693 + }, + { + "epoch": 1.4301517880113197, + "grad_norm": 0.21426336467266083, + "learning_rate": 6.323024054982818e-06, + "loss": 0.2589, + "step": 694 + }, + { + "epoch": 1.4322099305376899, + "grad_norm": 0.21294309198856354, + "learning_rate": 6.3001145475372285e-06, + "loss": 0.2615, + "step": 695 + }, + { + "epoch": 1.4342680730640596, + "grad_norm": 0.2021908164024353, + "learning_rate": 6.277205040091638e-06, + "loss": 0.2714, + "step": 696 + }, + { + "epoch": 1.4363262155904297, + "grad_norm": 0.21605439484119415, + "learning_rate": 6.254295532646049e-06, + "loss": 0.2592, + "step": 697 + }, + { + "epoch": 1.4383843581167997, + "grad_norm": 0.2154022753238678, + "learning_rate": 6.231386025200459e-06, + "loss": 0.2633, + "step": 698 + }, + { + "epoch": 1.4404425006431696, + "grad_norm": 0.2178344875574112, + "learning_rate": 6.208476517754869e-06, + "loss": 0.2685, + "step": 699 + }, + { + "epoch": 1.4425006431695395, + "grad_norm": 0.21423941850662231, + "learning_rate": 6.185567010309279e-06, + "loss": 0.2474, + "step": 700 + }, + { + "epoch": 1.4425006431695395, + "eval_loss": 0.27773216366767883, + "eval_runtime": 2423.2314, + "eval_samples_per_second": 3.208, + "eval_steps_per_second": 0.802, + "step": 700 + }, + { + "epoch": 1.4445587856959095, + "grad_norm": 0.19836685061454773, + "learning_rate": 6.1626575028636895e-06, + "loss": 0.2556, + "step": 701 + }, + { + "epoch": 1.4466169282222794, + "grad_norm": 0.21015697717666626, + "learning_rate": 6.139747995418099e-06, + "loss": 0.2605, + "step": 702 + }, + { + "epoch": 1.4486750707486493, + "grad_norm": 0.2158636897802353, + "learning_rate": 6.11683848797251e-06, + "loss": 0.252, + "step": 703 + }, + { + "epoch": 1.4507332132750193, + "grad_norm": 0.2136162966489792, + "learning_rate": 6.09392898052692e-06, + "loss": 0.2451, + "step": 704 + }, + { + "epoch": 1.4527913558013892, + "grad_norm": 0.21352505683898926, + "learning_rate": 6.0710194730813285e-06, + "loss": 0.2649, + "step": 705 + }, + { + "epoch": 1.4548494983277591, + "grad_norm": 0.22503146529197693, + "learning_rate": 6.048109965635739e-06, + "loss": 0.2604, + "step": 706 + }, + { + "epoch": 1.456907640854129, + "grad_norm": 0.2114841490983963, + "learning_rate": 6.025200458190149e-06, + "loss": 0.2547, + "step": 707 + }, + { + "epoch": 1.4589657833804992, + "grad_norm": 0.22603987157344818, + "learning_rate": 6.0022909507445594e-06, + "loss": 0.2551, + "step": 708 + }, + { + "epoch": 1.461023925906869, + "grad_norm": 0.2188458889722824, + "learning_rate": 5.979381443298969e-06, + "loss": 0.2553, + "step": 709 + }, + { + "epoch": 1.463082068433239, + "grad_norm": 0.21128129959106445, + "learning_rate": 5.95647193585338e-06, + "loss": 0.258, + "step": 710 + }, + { + "epoch": 1.465140210959609, + "grad_norm": 0.22289037704467773, + "learning_rate": 5.9335624284077895e-06, + "loss": 0.2709, + "step": 711 + }, + { + "epoch": 1.467198353485979, + "grad_norm": 0.21750517189502716, + "learning_rate": 5.9106529209622e-06, + "loss": 0.2597, + "step": 712 + }, + { + "epoch": 1.4692564960123489, + "grad_norm": 0.21022778749465942, + "learning_rate": 5.88774341351661e-06, + "loss": 0.2533, + "step": 713 + }, + { + "epoch": 1.4713146385387188, + "grad_norm": 0.21544480323791504, + "learning_rate": 5.8648339060710204e-06, + "loss": 0.255, + "step": 714 + }, + { + "epoch": 1.4733727810650887, + "grad_norm": 0.20856665074825287, + "learning_rate": 5.84192439862543e-06, + "loss": 0.2648, + "step": 715 + }, + { + "epoch": 1.4754309235914587, + "grad_norm": 0.2105010449886322, + "learning_rate": 5.819014891179841e-06, + "loss": 0.2611, + "step": 716 + }, + { + "epoch": 1.4774890661178286, + "grad_norm": 0.21749204397201538, + "learning_rate": 5.7961053837342505e-06, + "loss": 0.2551, + "step": 717 + }, + { + "epoch": 1.4795472086441985, + "grad_norm": 0.20478859543800354, + "learning_rate": 5.7731958762886594e-06, + "loss": 0.2554, + "step": 718 + }, + { + "epoch": 1.4816053511705687, + "grad_norm": 0.213475301861763, + "learning_rate": 5.75028636884307e-06, + "loss": 0.2622, + "step": 719 + }, + { + "epoch": 1.4836634936969384, + "grad_norm": 0.2008693963289261, + "learning_rate": 5.72737686139748e-06, + "loss": 0.2483, + "step": 720 + }, + { + "epoch": 1.4857216362233086, + "grad_norm": 0.19621135294437408, + "learning_rate": 5.70446735395189e-06, + "loss": 0.2553, + "step": 721 + }, + { + "epoch": 1.4877797787496785, + "grad_norm": 0.227009579539299, + "learning_rate": 5.6815578465063e-06, + "loss": 0.2529, + "step": 722 + }, + { + "epoch": 1.4898379212760484, + "grad_norm": 0.21584804356098175, + "learning_rate": 5.658648339060711e-06, + "loss": 0.2545, + "step": 723 + }, + { + "epoch": 1.4918960638024183, + "grad_norm": 0.2207970768213272, + "learning_rate": 5.6357388316151204e-06, + "loss": 0.2463, + "step": 724 + }, + { + "epoch": 1.4939542063287883, + "grad_norm": 0.22498710453510284, + "learning_rate": 5.612829324169531e-06, + "loss": 0.2593, + "step": 725 + }, + { + "epoch": 1.4960123488551582, + "grad_norm": 0.2146955132484436, + "learning_rate": 5.589919816723941e-06, + "loss": 0.2466, + "step": 726 + }, + { + "epoch": 1.4980704913815281, + "grad_norm": 0.21701963245868683, + "learning_rate": 5.567010309278351e-06, + "loss": 0.2602, + "step": 727 + }, + { + "epoch": 1.500128633907898, + "grad_norm": 0.2154153287410736, + "learning_rate": 5.544100801832761e-06, + "loss": 0.2652, + "step": 728 + }, + { + "epoch": 1.502186776434268, + "grad_norm": 0.2135971337556839, + "learning_rate": 5.521191294387172e-06, + "loss": 0.2465, + "step": 729 + }, + { + "epoch": 1.5042449189606382, + "grad_norm": 0.21887153387069702, + "learning_rate": 5.4982817869415815e-06, + "loss": 0.2553, + "step": 730 + }, + { + "epoch": 1.5063030614870079, + "grad_norm": 0.21986471116542816, + "learning_rate": 5.475372279495992e-06, + "loss": 0.2568, + "step": 731 + }, + { + "epoch": 1.508361204013378, + "grad_norm": 0.2224634885787964, + "learning_rate": 5.452462772050401e-06, + "loss": 0.2609, + "step": 732 + }, + { + "epoch": 1.5104193465397477, + "grad_norm": 0.22347122430801392, + "learning_rate": 5.429553264604811e-06, + "loss": 0.2557, + "step": 733 + }, + { + "epoch": 1.512477489066118, + "grad_norm": 0.21803030371665955, + "learning_rate": 5.406643757159221e-06, + "loss": 0.2622, + "step": 734 + }, + { + "epoch": 1.5145356315924878, + "grad_norm": 0.2078487128019333, + "learning_rate": 5.383734249713631e-06, + "loss": 0.249, + "step": 735 + }, + { + "epoch": 1.5165937741188578, + "grad_norm": 0.20815445482730865, + "learning_rate": 5.360824742268042e-06, + "loss": 0.2538, + "step": 736 + }, + { + "epoch": 1.5186519166452277, + "grad_norm": 0.21298891305923462, + "learning_rate": 5.337915234822451e-06, + "loss": 0.2532, + "step": 737 + }, + { + "epoch": 1.5207100591715976, + "grad_norm": 0.21032264828681946, + "learning_rate": 5.315005727376862e-06, + "loss": 0.2428, + "step": 738 + }, + { + "epoch": 1.5227682016979676, + "grad_norm": 0.23191553354263306, + "learning_rate": 5.292096219931272e-06, + "loss": 0.2545, + "step": 739 + }, + { + "epoch": 1.5248263442243375, + "grad_norm": 0.21168164908885956, + "learning_rate": 5.269186712485682e-06, + "loss": 0.2668, + "step": 740 + }, + { + "epoch": 1.5268844867507076, + "grad_norm": 0.2142658829689026, + "learning_rate": 5.246277205040092e-06, + "loss": 0.2589, + "step": 741 + }, + { + "epoch": 1.5289426292770774, + "grad_norm": 0.2130551040172577, + "learning_rate": 5.223367697594503e-06, + "loss": 0.248, + "step": 742 + }, + { + "epoch": 1.5310007718034475, + "grad_norm": 0.2171664535999298, + "learning_rate": 5.200458190148912e-06, + "loss": 0.2539, + "step": 743 + }, + { + "epoch": 1.5330589143298172, + "grad_norm": 0.21375024318695068, + "learning_rate": 5.177548682703323e-06, + "loss": 0.2471, + "step": 744 + }, + { + "epoch": 1.5351170568561874, + "grad_norm": 0.21037080883979797, + "learning_rate": 5.154639175257732e-06, + "loss": 0.2526, + "step": 745 + }, + { + "epoch": 1.537175199382557, + "grad_norm": 0.2103818953037262, + "learning_rate": 5.131729667812142e-06, + "loss": 0.2609, + "step": 746 + }, + { + "epoch": 1.5392333419089272, + "grad_norm": 0.21307708323001862, + "learning_rate": 5.108820160366552e-06, + "loss": 0.2606, + "step": 747 + }, + { + "epoch": 1.5412914844352972, + "grad_norm": 0.2052801549434662, + "learning_rate": 5.085910652920962e-06, + "loss": 0.2462, + "step": 748 + }, + { + "epoch": 1.543349626961667, + "grad_norm": 0.2059316784143448, + "learning_rate": 5.0630011454753726e-06, + "loss": 0.2593, + "step": 749 + }, + { + "epoch": 1.545407769488037, + "grad_norm": 0.211748406291008, + "learning_rate": 5.040091638029782e-06, + "loss": 0.2582, + "step": 750 + }, + { + "epoch": 1.547465912014407, + "grad_norm": 0.20883141458034515, + "learning_rate": 5.017182130584193e-06, + "loss": 0.251, + "step": 751 + }, + { + "epoch": 1.549524054540777, + "grad_norm": 0.21496839821338654, + "learning_rate": 4.994272623138603e-06, + "loss": 0.2486, + "step": 752 + }, + { + "epoch": 1.5515821970671468, + "grad_norm": 0.21443761885166168, + "learning_rate": 4.971363115693013e-06, + "loss": 0.2541, + "step": 753 + }, + { + "epoch": 1.553640339593517, + "grad_norm": 0.2164083868265152, + "learning_rate": 4.948453608247423e-06, + "loss": 0.2515, + "step": 754 + }, + { + "epoch": 1.5556984821198867, + "grad_norm": 0.22733120620250702, + "learning_rate": 4.9255441008018336e-06, + "loss": 0.2674, + "step": 755 + }, + { + "epoch": 1.5577566246462569, + "grad_norm": 0.21141202747821808, + "learning_rate": 4.902634593356243e-06, + "loss": 0.2586, + "step": 756 + }, + { + "epoch": 1.5598147671726266, + "grad_norm": 0.20612719655036926, + "learning_rate": 4.879725085910653e-06, + "loss": 0.2417, + "step": 757 + }, + { + "epoch": 1.5618729096989967, + "grad_norm": 0.21028929948806763, + "learning_rate": 4.856815578465064e-06, + "loss": 0.2546, + "step": 758 + }, + { + "epoch": 1.5639310522253667, + "grad_norm": 0.2196635901927948, + "learning_rate": 4.833906071019473e-06, + "loss": 0.2527, + "step": 759 + }, + { + "epoch": 1.5659891947517366, + "grad_norm": 0.20016127824783325, + "learning_rate": 4.810996563573884e-06, + "loss": 0.2629, + "step": 760 + }, + { + "epoch": 1.5680473372781065, + "grad_norm": 0.20597878098487854, + "learning_rate": 4.788087056128294e-06, + "loss": 0.2544, + "step": 761 + }, + { + "epoch": 1.5701054798044765, + "grad_norm": 0.20151163637638092, + "learning_rate": 4.7651775486827035e-06, + "loss": 0.2569, + "step": 762 + }, + { + "epoch": 1.5721636223308464, + "grad_norm": 0.21117815375328064, + "learning_rate": 4.742268041237113e-06, + "loss": 0.2602, + "step": 763 + }, + { + "epoch": 1.5742217648572163, + "grad_norm": 0.20184555649757385, + "learning_rate": 4.719358533791524e-06, + "loss": 0.2673, + "step": 764 + }, + { + "epoch": 1.5762799073835865, + "grad_norm": 0.20125100016593933, + "learning_rate": 4.6964490263459336e-06, + "loss": 0.2669, + "step": 765 + }, + { + "epoch": 1.5783380499099562, + "grad_norm": 0.2209872603416443, + "learning_rate": 4.673539518900344e-06, + "loss": 0.2569, + "step": 766 + }, + { + "epoch": 1.5803961924363263, + "grad_norm": 0.21065855026245117, + "learning_rate": 4.650630011454754e-06, + "loss": 0.2477, + "step": 767 + }, + { + "epoch": 1.582454334962696, + "grad_norm": 0.20995444059371948, + "learning_rate": 4.6277205040091645e-06, + "loss": 0.2563, + "step": 768 + }, + { + "epoch": 1.5845124774890662, + "grad_norm": 0.21762295067310333, + "learning_rate": 4.604810996563574e-06, + "loss": 0.2443, + "step": 769 + }, + { + "epoch": 1.586570620015436, + "grad_norm": 0.21741704642772675, + "learning_rate": 4.581901489117984e-06, + "loss": 0.2442, + "step": 770 + }, + { + "epoch": 1.588628762541806, + "grad_norm": 0.21586772799491882, + "learning_rate": 4.5589919816723946e-06, + "loss": 0.2484, + "step": 771 + }, + { + "epoch": 1.590686905068176, + "grad_norm": 0.22184152901172638, + "learning_rate": 4.536082474226804e-06, + "loss": 0.2522, + "step": 772 + }, + { + "epoch": 1.592745047594546, + "grad_norm": 0.22210553288459778, + "learning_rate": 4.513172966781215e-06, + "loss": 0.2552, + "step": 773 + }, + { + "epoch": 1.5948031901209159, + "grad_norm": 0.2075122743844986, + "learning_rate": 4.490263459335625e-06, + "loss": 0.263, + "step": 774 + }, + { + "epoch": 1.5968613326472858, + "grad_norm": 0.20110896229743958, + "learning_rate": 4.467353951890035e-06, + "loss": 0.248, + "step": 775 + }, + { + "epoch": 1.5989194751736557, + "grad_norm": 0.2067912071943283, + "learning_rate": 4.444444444444444e-06, + "loss": 0.2485, + "step": 776 + }, + { + "epoch": 1.6009776177000257, + "grad_norm": 0.2091452181339264, + "learning_rate": 4.421534936998855e-06, + "loss": 0.2783, + "step": 777 + }, + { + "epoch": 1.6030357602263958, + "grad_norm": 0.21414563059806824, + "learning_rate": 4.3986254295532645e-06, + "loss": 0.2502, + "step": 778 + }, + { + "epoch": 1.6050939027527655, + "grad_norm": 0.21657651662826538, + "learning_rate": 4.375715922107675e-06, + "loss": 0.2589, + "step": 779 + }, + { + "epoch": 1.6071520452791357, + "grad_norm": 0.21607093513011932, + "learning_rate": 4.352806414662085e-06, + "loss": 0.2618, + "step": 780 + }, + { + "epoch": 1.6092101878055054, + "grad_norm": 0.21846850216388702, + "learning_rate": 4.329896907216495e-06, + "loss": 0.2549, + "step": 781 + }, + { + "epoch": 1.6112683303318756, + "grad_norm": 0.21873261034488678, + "learning_rate": 4.306987399770905e-06, + "loss": 0.2448, + "step": 782 + }, + { + "epoch": 1.6133264728582455, + "grad_norm": 0.22608645260334015, + "learning_rate": 4.284077892325315e-06, + "loss": 0.2559, + "step": 783 + }, + { + "epoch": 1.6153846153846154, + "grad_norm": 0.2121078372001648, + "learning_rate": 4.2611683848797255e-06, + "loss": 0.2515, + "step": 784 + }, + { + "epoch": 1.6174427579109854, + "grad_norm": 0.227590411901474, + "learning_rate": 4.238258877434135e-06, + "loss": 0.2549, + "step": 785 + }, + { + "epoch": 1.6195009004373553, + "grad_norm": 0.201515793800354, + "learning_rate": 4.215349369988546e-06, + "loss": 0.2601, + "step": 786 + }, + { + "epoch": 1.6215590429637252, + "grad_norm": 0.21896880865097046, + "learning_rate": 4.192439862542956e-06, + "loss": 0.2564, + "step": 787 + }, + { + "epoch": 1.6236171854900951, + "grad_norm": 0.21509996056556702, + "learning_rate": 4.169530355097366e-06, + "loss": 0.2491, + "step": 788 + }, + { + "epoch": 1.6256753280164653, + "grad_norm": 0.22020220756530762, + "learning_rate": 4.146620847651776e-06, + "loss": 0.2617, + "step": 789 + }, + { + "epoch": 1.627733470542835, + "grad_norm": 0.21420395374298096, + "learning_rate": 4.123711340206186e-06, + "loss": 0.2561, + "step": 790 + }, + { + "epoch": 1.6297916130692052, + "grad_norm": 0.2270808070898056, + "learning_rate": 4.100801832760596e-06, + "loss": 0.2621, + "step": 791 + }, + { + "epoch": 1.6318497555955749, + "grad_norm": 0.2320822924375534, + "learning_rate": 4.077892325315006e-06, + "loss": 0.269, + "step": 792 + }, + { + "epoch": 1.633907898121945, + "grad_norm": 0.21081334352493286, + "learning_rate": 4.054982817869416e-06, + "loss": 0.2468, + "step": 793 + }, + { + "epoch": 1.6359660406483147, + "grad_norm": 0.2204331010580063, + "learning_rate": 4.032073310423826e-06, + "loss": 0.2578, + "step": 794 + }, + { + "epoch": 1.638024183174685, + "grad_norm": 0.20907023549079895, + "learning_rate": 4.009163802978236e-06, + "loss": 0.2523, + "step": 795 + }, + { + "epoch": 1.6400823257010548, + "grad_norm": 0.23108816146850586, + "learning_rate": 3.986254295532647e-06, + "loss": 0.2619, + "step": 796 + }, + { + "epoch": 1.6421404682274248, + "grad_norm": 0.20781853795051575, + "learning_rate": 3.9633447880870564e-06, + "loss": 0.2496, + "step": 797 + }, + { + "epoch": 1.6441986107537947, + "grad_norm": 0.20635871589183807, + "learning_rate": 3.940435280641466e-06, + "loss": 0.2527, + "step": 798 + }, + { + "epoch": 1.6462567532801646, + "grad_norm": 0.21636071801185608, + "learning_rate": 3.917525773195877e-06, + "loss": 0.2585, + "step": 799 + }, + { + "epoch": 1.6483148958065346, + "grad_norm": 0.20485584437847137, + "learning_rate": 3.8946162657502865e-06, + "loss": 0.259, + "step": 800 + }, + { + "epoch": 1.6483148958065346, + "eval_loss": 0.27598318457603455, + "eval_runtime": 2425.8429, + "eval_samples_per_second": 3.205, + "eval_steps_per_second": 0.801, + "step": 800 + }, + { + "epoch": 1.6503730383329045, + "grad_norm": 0.2005116492509842, + "learning_rate": 3.871706758304697e-06, + "loss": 0.249, + "step": 801 + }, + { + "epoch": 1.6524311808592747, + "grad_norm": 0.20868004858493805, + "learning_rate": 3.848797250859107e-06, + "loss": 0.2502, + "step": 802 + }, + { + "epoch": 1.6544893233856444, + "grad_norm": 0.2084902971982956, + "learning_rate": 3.825887743413517e-06, + "loss": 0.2504, + "step": 803 + }, + { + "epoch": 1.6565474659120145, + "grad_norm": 0.2042844593524933, + "learning_rate": 3.8029782359679268e-06, + "loss": 0.2517, + "step": 804 + }, + { + "epoch": 1.6586056084383842, + "grad_norm": 0.2120312601327896, + "learning_rate": 3.780068728522337e-06, + "loss": 0.2466, + "step": 805 + }, + { + "epoch": 1.6606637509647544, + "grad_norm": 0.21600568294525146, + "learning_rate": 3.757159221076747e-06, + "loss": 0.2521, + "step": 806 + }, + { + "epoch": 1.6627218934911243, + "grad_norm": 0.22209151089191437, + "learning_rate": 3.7342497136311573e-06, + "loss": 0.2662, + "step": 807 + }, + { + "epoch": 1.6647800360174942, + "grad_norm": 0.22267431020736694, + "learning_rate": 3.7113402061855674e-06, + "loss": 0.2498, + "step": 808 + }, + { + "epoch": 1.6668381785438642, + "grad_norm": 0.21584516763687134, + "learning_rate": 3.6884306987399776e-06, + "loss": 0.2528, + "step": 809 + }, + { + "epoch": 1.6688963210702341, + "grad_norm": 0.20465044677257538, + "learning_rate": 3.6655211912943874e-06, + "loss": 0.2569, + "step": 810 + }, + { + "epoch": 1.670954463596604, + "grad_norm": 0.21515893936157227, + "learning_rate": 3.6426116838487975e-06, + "loss": 0.2466, + "step": 811 + }, + { + "epoch": 1.673012606122974, + "grad_norm": 0.22349058091640472, + "learning_rate": 3.6197021764032077e-06, + "loss": 0.2438, + "step": 812 + }, + { + "epoch": 1.6750707486493441, + "grad_norm": 0.23108039796352386, + "learning_rate": 3.596792668957618e-06, + "loss": 0.2461, + "step": 813 + }, + { + "epoch": 1.6771288911757138, + "grad_norm": 0.22195585072040558, + "learning_rate": 3.573883161512028e-06, + "loss": 0.2496, + "step": 814 + }, + { + "epoch": 1.679187033702084, + "grad_norm": 0.22752366960048676, + "learning_rate": 3.550973654066438e-06, + "loss": 0.2495, + "step": 815 + }, + { + "epoch": 1.6812451762284537, + "grad_norm": 0.21112024784088135, + "learning_rate": 3.5280641466208484e-06, + "loss": 0.2453, + "step": 816 + }, + { + "epoch": 1.6833033187548239, + "grad_norm": 0.21209532022476196, + "learning_rate": 3.5051546391752577e-06, + "loss": 0.2466, + "step": 817 + }, + { + "epoch": 1.6853614612811936, + "grad_norm": 0.21724505722522736, + "learning_rate": 3.482245131729668e-06, + "loss": 0.2449, + "step": 818 + }, + { + "epoch": 1.6874196038075637, + "grad_norm": 0.22240252792835236, + "learning_rate": 3.459335624284078e-06, + "loss": 0.2641, + "step": 819 + }, + { + "epoch": 1.6894777463339337, + "grad_norm": 0.217677503824234, + "learning_rate": 3.436426116838488e-06, + "loss": 0.2515, + "step": 820 + }, + { + "epoch": 1.6915358888603036, + "grad_norm": 0.2246546596288681, + "learning_rate": 3.4135166093928984e-06, + "loss": 0.2515, + "step": 821 + }, + { + "epoch": 1.6935940313866735, + "grad_norm": 0.20842307806015015, + "learning_rate": 3.3906071019473085e-06, + "loss": 0.2584, + "step": 822 + }, + { + "epoch": 1.6956521739130435, + "grad_norm": 0.21404647827148438, + "learning_rate": 3.3676975945017187e-06, + "loss": 0.2465, + "step": 823 + }, + { + "epoch": 1.6977103164394134, + "grad_norm": 0.21396222710609436, + "learning_rate": 3.3447880870561285e-06, + "loss": 0.2524, + "step": 824 + }, + { + "epoch": 1.6997684589657833, + "grad_norm": 0.21428625285625458, + "learning_rate": 3.3218785796105386e-06, + "loss": 0.2554, + "step": 825 + }, + { + "epoch": 1.7018266014921535, + "grad_norm": 0.21156470477581024, + "learning_rate": 3.298969072164949e-06, + "loss": 0.2427, + "step": 826 + }, + { + "epoch": 1.7038847440185232, + "grad_norm": 0.21247607469558716, + "learning_rate": 3.276059564719359e-06, + "loss": 0.2398, + "step": 827 + }, + { + "epoch": 1.7059428865448933, + "grad_norm": 0.2125396430492401, + "learning_rate": 3.253150057273769e-06, + "loss": 0.263, + "step": 828 + }, + { + "epoch": 1.708001029071263, + "grad_norm": 0.2217744141817093, + "learning_rate": 3.2302405498281793e-06, + "loss": 0.2447, + "step": 829 + }, + { + "epoch": 1.7100591715976332, + "grad_norm": 0.21454688906669617, + "learning_rate": 3.2073310423825895e-06, + "loss": 0.251, + "step": 830 + }, + { + "epoch": 1.7121173141240031, + "grad_norm": 0.20734067261219025, + "learning_rate": 3.1844215349369988e-06, + "loss": 0.2463, + "step": 831 + }, + { + "epoch": 1.714175456650373, + "grad_norm": 0.21725836396217346, + "learning_rate": 3.161512027491409e-06, + "loss": 0.2519, + "step": 832 + }, + { + "epoch": 1.716233599176743, + "grad_norm": 0.21743294596672058, + "learning_rate": 3.138602520045819e-06, + "loss": 0.2525, + "step": 833 + }, + { + "epoch": 1.718291741703113, + "grad_norm": 0.204753115773201, + "learning_rate": 3.1156930126002293e-06, + "loss": 0.2608, + "step": 834 + }, + { + "epoch": 1.7203498842294829, + "grad_norm": 0.2242167741060257, + "learning_rate": 3.0927835051546395e-06, + "loss": 0.2567, + "step": 835 + }, + { + "epoch": 1.7224080267558528, + "grad_norm": 0.21592582762241364, + "learning_rate": 3.0698739977090496e-06, + "loss": 0.2518, + "step": 836 + }, + { + "epoch": 1.724466169282223, + "grad_norm": 0.2153058797121048, + "learning_rate": 3.04696449026346e-06, + "loss": 0.2517, + "step": 837 + }, + { + "epoch": 1.7265243118085927, + "grad_norm": 0.20874065160751343, + "learning_rate": 3.0240549828178695e-06, + "loss": 0.2567, + "step": 838 + }, + { + "epoch": 1.7285824543349628, + "grad_norm": 0.22043587267398834, + "learning_rate": 3.0011454753722797e-06, + "loss": 0.2587, + "step": 839 + }, + { + "epoch": 1.7306405968613325, + "grad_norm": 0.20169200003147125, + "learning_rate": 2.97823596792669e-06, + "loss": 0.2629, + "step": 840 + }, + { + "epoch": 1.7326987393877027, + "grad_norm": 0.21708932518959045, + "learning_rate": 2.9553264604811e-06, + "loss": 0.2604, + "step": 841 + }, + { + "epoch": 1.7347568819140724, + "grad_norm": 0.21232086420059204, + "learning_rate": 2.9324169530355102e-06, + "loss": 0.2498, + "step": 842 + }, + { + "epoch": 1.7368150244404426, + "grad_norm": 0.20930655300617218, + "learning_rate": 2.9095074455899204e-06, + "loss": 0.2584, + "step": 843 + }, + { + "epoch": 1.7388731669668125, + "grad_norm": 0.207666277885437, + "learning_rate": 2.8865979381443297e-06, + "loss": 0.2414, + "step": 844 + }, + { + "epoch": 1.7409313094931824, + "grad_norm": 0.20519839227199554, + "learning_rate": 2.86368843069874e-06, + "loss": 0.2524, + "step": 845 + }, + { + "epoch": 1.7429894520195524, + "grad_norm": 0.22689610719680786, + "learning_rate": 2.84077892325315e-06, + "loss": 0.2502, + "step": 846 + }, + { + "epoch": 1.7450475945459223, + "grad_norm": 0.22423967719078064, + "learning_rate": 2.8178694158075602e-06, + "loss": 0.2432, + "step": 847 + }, + { + "epoch": 1.7471057370722922, + "grad_norm": 0.21444083750247955, + "learning_rate": 2.7949599083619704e-06, + "loss": 0.2501, + "step": 848 + }, + { + "epoch": 1.7491638795986622, + "grad_norm": 0.20746010541915894, + "learning_rate": 2.7720504009163806e-06, + "loss": 0.2585, + "step": 849 + }, + { + "epoch": 1.7512220221250323, + "grad_norm": 0.22796258330345154, + "learning_rate": 2.7491408934707907e-06, + "loss": 0.2482, + "step": 850 + }, + { + "epoch": 1.753280164651402, + "grad_norm": 0.21120622754096985, + "learning_rate": 2.7262313860252005e-06, + "loss": 0.2617, + "step": 851 + }, + { + "epoch": 1.7553383071777722, + "grad_norm": 0.21528108417987823, + "learning_rate": 2.7033218785796106e-06, + "loss": 0.2564, + "step": 852 + }, + { + "epoch": 1.7573964497041419, + "grad_norm": 0.2123376727104187, + "learning_rate": 2.680412371134021e-06, + "loss": 0.2505, + "step": 853 + }, + { + "epoch": 1.759454592230512, + "grad_norm": 0.2255619317293167, + "learning_rate": 2.657502863688431e-06, + "loss": 0.2598, + "step": 854 + }, + { + "epoch": 1.7615127347568817, + "grad_norm": 0.21333782374858856, + "learning_rate": 2.634593356242841e-06, + "loss": 0.2456, + "step": 855 + }, + { + "epoch": 1.763570877283252, + "grad_norm": 0.20801705121994019, + "learning_rate": 2.6116838487972513e-06, + "loss": 0.249, + "step": 856 + }, + { + "epoch": 1.7656290198096218, + "grad_norm": 0.2295520156621933, + "learning_rate": 2.5887743413516615e-06, + "loss": 0.2711, + "step": 857 + }, + { + "epoch": 1.7676871623359918, + "grad_norm": 0.21109919250011444, + "learning_rate": 2.565864833906071e-06, + "loss": 0.2484, + "step": 858 + }, + { + "epoch": 1.7697453048623617, + "grad_norm": 0.2123642861843109, + "learning_rate": 2.542955326460481e-06, + "loss": 0.2545, + "step": 859 + }, + { + "epoch": 1.7718034473887316, + "grad_norm": 0.20756429433822632, + "learning_rate": 2.520045819014891e-06, + "loss": 0.2479, + "step": 860 + }, + { + "epoch": 1.7738615899151016, + "grad_norm": 0.21249566972255707, + "learning_rate": 2.4971363115693013e-06, + "loss": 0.2473, + "step": 861 + }, + { + "epoch": 1.7759197324414715, + "grad_norm": 0.22438718378543854, + "learning_rate": 2.4742268041237115e-06, + "loss": 0.2549, + "step": 862 + }, + { + "epoch": 1.7779778749678417, + "grad_norm": 0.22067435085773468, + "learning_rate": 2.4513172966781217e-06, + "loss": 0.26, + "step": 863 + }, + { + "epoch": 1.7800360174942114, + "grad_norm": 0.2168402522802353, + "learning_rate": 2.428407789232532e-06, + "loss": 0.2541, + "step": 864 + }, + { + "epoch": 1.7820941600205815, + "grad_norm": 0.2113119214773178, + "learning_rate": 2.405498281786942e-06, + "loss": 0.2583, + "step": 865 + }, + { + "epoch": 1.7841523025469512, + "grad_norm": 0.21283333003520966, + "learning_rate": 2.3825887743413517e-06, + "loss": 0.2564, + "step": 866 + }, + { + "epoch": 1.7862104450733214, + "grad_norm": 0.21427619457244873, + "learning_rate": 2.359679266895762e-06, + "loss": 0.2533, + "step": 867 + }, + { + "epoch": 1.7882685875996913, + "grad_norm": 0.20976261794567108, + "learning_rate": 2.336769759450172e-06, + "loss": 0.2662, + "step": 868 + }, + { + "epoch": 1.7903267301260612, + "grad_norm": 0.22446084022521973, + "learning_rate": 2.3138602520045822e-06, + "loss": 0.2566, + "step": 869 + }, + { + "epoch": 1.7923848726524312, + "grad_norm": 0.21603813767433167, + "learning_rate": 2.290950744558992e-06, + "loss": 0.2427, + "step": 870 + }, + { + "epoch": 1.7944430151788011, + "grad_norm": 0.21606098115444183, + "learning_rate": 2.268041237113402e-06, + "loss": 0.2503, + "step": 871 + }, + { + "epoch": 1.796501157705171, + "grad_norm": 0.20895624160766602, + "learning_rate": 2.2451317296678123e-06, + "loss": 0.2423, + "step": 872 + }, + { + "epoch": 1.798559300231541, + "grad_norm": 0.21321886777877808, + "learning_rate": 2.222222222222222e-06, + "loss": 0.2514, + "step": 873 + }, + { + "epoch": 1.8006174427579111, + "grad_norm": 0.2091333568096161, + "learning_rate": 2.1993127147766322e-06, + "loss": 0.2593, + "step": 874 + }, + { + "epoch": 1.8026755852842808, + "grad_norm": 0.2109704166650772, + "learning_rate": 2.1764032073310424e-06, + "loss": 0.2626, + "step": 875 + }, + { + "epoch": 1.804733727810651, + "grad_norm": 0.21323198080062866, + "learning_rate": 2.1534936998854526e-06, + "loss": 0.2517, + "step": 876 + }, + { + "epoch": 1.8067918703370207, + "grad_norm": 0.21177341043949127, + "learning_rate": 2.1305841924398628e-06, + "loss": 0.2589, + "step": 877 + }, + { + "epoch": 1.8088500128633909, + "grad_norm": 0.21436013281345367, + "learning_rate": 2.107674684994273e-06, + "loss": 0.2498, + "step": 878 + }, + { + "epoch": 1.8109081553897606, + "grad_norm": 0.21496744453907013, + "learning_rate": 2.084765177548683e-06, + "loss": 0.2595, + "step": 879 + }, + { + "epoch": 1.8129662979161307, + "grad_norm": 0.21034789085388184, + "learning_rate": 2.061855670103093e-06, + "loss": 0.2494, + "step": 880 + }, + { + "epoch": 1.8150244404425007, + "grad_norm": 0.20836222171783447, + "learning_rate": 2.038946162657503e-06, + "loss": 0.2526, + "step": 881 + }, + { + "epoch": 1.8170825829688706, + "grad_norm": 0.21801823377609253, + "learning_rate": 2.016036655211913e-06, + "loss": 0.2594, + "step": 882 + }, + { + "epoch": 1.8191407254952405, + "grad_norm": 0.20607352256774902, + "learning_rate": 1.9931271477663233e-06, + "loss": 0.2636, + "step": 883 + }, + { + "epoch": 1.8211988680216105, + "grad_norm": 0.2141195684671402, + "learning_rate": 1.970217640320733e-06, + "loss": 0.2572, + "step": 884 + }, + { + "epoch": 1.8232570105479804, + "grad_norm": 0.2243940681219101, + "learning_rate": 1.9473081328751433e-06, + "loss": 0.2575, + "step": 885 + }, + { + "epoch": 1.8253151530743503, + "grad_norm": 0.20857423543930054, + "learning_rate": 1.9243986254295534e-06, + "loss": 0.2464, + "step": 886 + }, + { + "epoch": 1.8273732956007205, + "grad_norm": 0.19861185550689697, + "learning_rate": 1.9014891179839634e-06, + "loss": 0.2552, + "step": 887 + }, + { + "epoch": 1.8294314381270902, + "grad_norm": 0.2101699262857437, + "learning_rate": 1.8785796105383736e-06, + "loss": 0.2716, + "step": 888 + }, + { + "epoch": 1.8314895806534603, + "grad_norm": 0.2184407114982605, + "learning_rate": 1.8556701030927837e-06, + "loss": 0.2585, + "step": 889 + }, + { + "epoch": 1.83354772317983, + "grad_norm": 0.20467938482761383, + "learning_rate": 1.8327605956471937e-06, + "loss": 0.2548, + "step": 890 + }, + { + "epoch": 1.8356058657062002, + "grad_norm": 0.21270884573459625, + "learning_rate": 1.8098510882016038e-06, + "loss": 0.2663, + "step": 891 + }, + { + "epoch": 1.8376640082325701, + "grad_norm": 0.2190205454826355, + "learning_rate": 1.786941580756014e-06, + "loss": 0.2529, + "step": 892 + }, + { + "epoch": 1.83972215075894, + "grad_norm": 0.21621330082416534, + "learning_rate": 1.7640320733104242e-06, + "loss": 0.2587, + "step": 893 + }, + { + "epoch": 1.84178029328531, + "grad_norm": 0.22347432374954224, + "learning_rate": 1.741122565864834e-06, + "loss": 0.2547, + "step": 894 + }, + { + "epoch": 1.84383843581168, + "grad_norm": 0.20814360678195953, + "learning_rate": 1.718213058419244e-06, + "loss": 0.2431, + "step": 895 + }, + { + "epoch": 1.8458965783380499, + "grad_norm": 0.2169455736875534, + "learning_rate": 1.6953035509736543e-06, + "loss": 0.2471, + "step": 896 + }, + { + "epoch": 1.8479547208644198, + "grad_norm": 0.20700973272323608, + "learning_rate": 1.6723940435280642e-06, + "loss": 0.2532, + "step": 897 + }, + { + "epoch": 1.85001286339079, + "grad_norm": 0.2104254812002182, + "learning_rate": 1.6494845360824744e-06, + "loss": 0.2417, + "step": 898 + }, + { + "epoch": 1.8520710059171597, + "grad_norm": 0.2133847326040268, + "learning_rate": 1.6265750286368846e-06, + "loss": 0.2499, + "step": 899 + }, + { + "epoch": 1.8541291484435298, + "grad_norm": 0.21578392386436462, + "learning_rate": 1.6036655211912947e-06, + "loss": 0.252, + "step": 900 + }, + { + "epoch": 1.8541291484435298, + "eval_loss": 0.2747899889945984, + "eval_runtime": 2422.9271, + "eval_samples_per_second": 3.209, + "eval_steps_per_second": 0.802, + "step": 900 + }, + { + "epoch": 1.8561872909698995, + "grad_norm": 0.21070148050785065, + "learning_rate": 1.5807560137457045e-06, + "loss": 0.2587, + "step": 901 + }, + { + "epoch": 1.8582454334962697, + "grad_norm": 0.220760315656662, + "learning_rate": 1.5578465063001146e-06, + "loss": 0.2579, + "step": 902 + }, + { + "epoch": 1.8603035760226394, + "grad_norm": 0.21288466453552246, + "learning_rate": 1.5349369988545248e-06, + "loss": 0.2602, + "step": 903 + }, + { + "epoch": 1.8623617185490096, + "grad_norm": 0.21258701384067535, + "learning_rate": 1.5120274914089348e-06, + "loss": 0.2496, + "step": 904 + }, + { + "epoch": 1.8644198610753795, + "grad_norm": 0.22993379831314087, + "learning_rate": 1.489117983963345e-06, + "loss": 0.2563, + "step": 905 + }, + { + "epoch": 1.8664780036017494, + "grad_norm": 0.23294076323509216, + "learning_rate": 1.4662084765177551e-06, + "loss": 0.2487, + "step": 906 + }, + { + "epoch": 1.8685361461281194, + "grad_norm": 0.2120734453201294, + "learning_rate": 1.4432989690721649e-06, + "loss": 0.2542, + "step": 907 + }, + { + "epoch": 1.8705942886544893, + "grad_norm": 0.21382124722003937, + "learning_rate": 1.420389461626575e-06, + "loss": 0.2506, + "step": 908 + }, + { + "epoch": 1.8726524311808592, + "grad_norm": 0.21298156678676605, + "learning_rate": 1.3974799541809852e-06, + "loss": 0.253, + "step": 909 + }, + { + "epoch": 1.8747105737072292, + "grad_norm": 0.21141862869262695, + "learning_rate": 1.3745704467353954e-06, + "loss": 0.25, + "step": 910 + }, + { + "epoch": 1.8767687162335993, + "grad_norm": 0.2200550138950348, + "learning_rate": 1.3516609392898053e-06, + "loss": 0.2591, + "step": 911 + }, + { + "epoch": 1.878826858759969, + "grad_norm": 0.21004612743854523, + "learning_rate": 1.3287514318442155e-06, + "loss": 0.2409, + "step": 912 + }, + { + "epoch": 1.8808850012863392, + "grad_norm": 0.20979297161102295, + "learning_rate": 1.3058419243986257e-06, + "loss": 0.2615, + "step": 913 + }, + { + "epoch": 1.8829431438127089, + "grad_norm": 0.2132391333580017, + "learning_rate": 1.2829324169530354e-06, + "loss": 0.2655, + "step": 914 + }, + { + "epoch": 1.885001286339079, + "grad_norm": 0.2096961885690689, + "learning_rate": 1.2600229095074456e-06, + "loss": 0.2531, + "step": 915 + }, + { + "epoch": 1.887059428865449, + "grad_norm": 0.23014317452907562, + "learning_rate": 1.2371134020618557e-06, + "loss": 0.2446, + "step": 916 + }, + { + "epoch": 1.889117571391819, + "grad_norm": 0.2189033031463623, + "learning_rate": 1.214203894616266e-06, + "loss": 0.2583, + "step": 917 + }, + { + "epoch": 1.8911757139181888, + "grad_norm": 0.20889656245708466, + "learning_rate": 1.1912943871706759e-06, + "loss": 0.2545, + "step": 918 + }, + { + "epoch": 1.8932338564445588, + "grad_norm": 0.20667313039302826, + "learning_rate": 1.168384879725086e-06, + "loss": 0.2517, + "step": 919 + }, + { + "epoch": 1.8952919989709287, + "grad_norm": 0.2061508297920227, + "learning_rate": 1.145475372279496e-06, + "loss": 0.2506, + "step": 920 + }, + { + "epoch": 1.8973501414972986, + "grad_norm": 0.21874327957630157, + "learning_rate": 1.1225658648339062e-06, + "loss": 0.2613, + "step": 921 + }, + { + "epoch": 1.8994082840236688, + "grad_norm": 0.21179988980293274, + "learning_rate": 1.0996563573883161e-06, + "loss": 0.2445, + "step": 922 + }, + { + "epoch": 1.9014664265500385, + "grad_norm": 0.20976871252059937, + "learning_rate": 1.0767468499427263e-06, + "loss": 0.2512, + "step": 923 + }, + { + "epoch": 1.9035245690764087, + "grad_norm": 0.2153770625591278, + "learning_rate": 1.0538373424971365e-06, + "loss": 0.242, + "step": 924 + }, + { + "epoch": 1.9055827116027784, + "grad_norm": 0.22022761404514313, + "learning_rate": 1.0309278350515464e-06, + "loss": 0.2495, + "step": 925 + }, + { + "epoch": 1.9076408541291485, + "grad_norm": 0.20028620958328247, + "learning_rate": 1.0080183276059566e-06, + "loss": 0.2424, + "step": 926 + }, + { + "epoch": 1.9096989966555182, + "grad_norm": 0.21675117313861847, + "learning_rate": 9.851088201603665e-07, + "loss": 0.2547, + "step": 927 + }, + { + "epoch": 1.9117571391818884, + "grad_norm": 0.21667635440826416, + "learning_rate": 9.621993127147767e-07, + "loss": 0.2494, + "step": 928 + }, + { + "epoch": 1.9138152817082583, + "grad_norm": 0.212045818567276, + "learning_rate": 9.392898052691868e-07, + "loss": 0.2582, + "step": 929 + }, + { + "epoch": 1.9158734242346283, + "grad_norm": 0.20664039254188538, + "learning_rate": 9.163802978235968e-07, + "loss": 0.2498, + "step": 930 + }, + { + "epoch": 1.9179315667609982, + "grad_norm": 0.20585620403289795, + "learning_rate": 8.93470790378007e-07, + "loss": 0.2479, + "step": 931 + }, + { + "epoch": 1.9199897092873681, + "grad_norm": 0.21589615941047668, + "learning_rate": 8.70561282932417e-07, + "loss": 0.2587, + "step": 932 + }, + { + "epoch": 1.922047851813738, + "grad_norm": 0.2254379540681839, + "learning_rate": 8.476517754868271e-07, + "loss": 0.2349, + "step": 933 + }, + { + "epoch": 1.924105994340108, + "grad_norm": 0.21472762525081635, + "learning_rate": 8.247422680412372e-07, + "loss": 0.243, + "step": 934 + }, + { + "epoch": 1.9261641368664781, + "grad_norm": 0.2058337926864624, + "learning_rate": 8.018327605956474e-07, + "loss": 0.2508, + "step": 935 + }, + { + "epoch": 1.9282222793928478, + "grad_norm": 0.21240606904029846, + "learning_rate": 7.789232531500573e-07, + "loss": 0.2526, + "step": 936 + }, + { + "epoch": 1.930280421919218, + "grad_norm": 0.218268483877182, + "learning_rate": 7.560137457044674e-07, + "loss": 0.2573, + "step": 937 + }, + { + "epoch": 1.9323385644455877, + "grad_norm": 0.21556495130062103, + "learning_rate": 7.331042382588776e-07, + "loss": 0.2567, + "step": 938 + }, + { + "epoch": 1.9343967069719579, + "grad_norm": 0.21113666892051697, + "learning_rate": 7.101947308132875e-07, + "loss": 0.255, + "step": 939 + }, + { + "epoch": 1.9364548494983278, + "grad_norm": 0.23069624602794647, + "learning_rate": 6.872852233676977e-07, + "loss": 0.248, + "step": 940 + }, + { + "epoch": 1.9385129920246977, + "grad_norm": 0.22153766453266144, + "learning_rate": 6.643757159221077e-07, + "loss": 0.2521, + "step": 941 + }, + { + "epoch": 1.9405711345510677, + "grad_norm": 0.21001935005187988, + "learning_rate": 6.414662084765177e-07, + "loss": 0.2522, + "step": 942 + }, + { + "epoch": 1.9426292770774376, + "grad_norm": 0.2235831767320633, + "learning_rate": 6.185567010309279e-07, + "loss": 0.265, + "step": 943 + }, + { + "epoch": 1.9446874196038075, + "grad_norm": 0.21761031448841095, + "learning_rate": 5.956471935853379e-07, + "loss": 0.2517, + "step": 944 + }, + { + "epoch": 1.9467455621301775, + "grad_norm": 0.2145112156867981, + "learning_rate": 5.72737686139748e-07, + "loss": 0.2356, + "step": 945 + }, + { + "epoch": 1.9488037046565476, + "grad_norm": 0.21806856989860535, + "learning_rate": 5.498281786941581e-07, + "loss": 0.2528, + "step": 946 + }, + { + "epoch": 1.9508618471829173, + "grad_norm": 0.21642406284809113, + "learning_rate": 5.269186712485682e-07, + "loss": 0.2534, + "step": 947 + }, + { + "epoch": 1.9529199897092875, + "grad_norm": 0.20829197764396667, + "learning_rate": 5.040091638029783e-07, + "loss": 0.2586, + "step": 948 + }, + { + "epoch": 1.9549781322356572, + "grad_norm": 0.20364080369472504, + "learning_rate": 4.810996563573884e-07, + "loss": 0.2592, + "step": 949 + }, + { + "epoch": 1.9570362747620274, + "grad_norm": 0.22106331586837769, + "learning_rate": 4.581901489117984e-07, + "loss": 0.2527, + "step": 950 + }, + { + "epoch": 1.959094417288397, + "grad_norm": 0.21560247242450714, + "learning_rate": 4.352806414662085e-07, + "loss": 0.2537, + "step": 951 + }, + { + "epoch": 1.9611525598147672, + "grad_norm": 0.2174178659915924, + "learning_rate": 4.123711340206186e-07, + "loss": 0.2556, + "step": 952 + }, + { + "epoch": 1.9632107023411371, + "grad_norm": 0.2129729986190796, + "learning_rate": 3.8946162657502866e-07, + "loss": 0.2633, + "step": 953 + }, + { + "epoch": 1.965268844867507, + "grad_norm": 0.21184976398944855, + "learning_rate": 3.665521191294388e-07, + "loss": 0.2603, + "step": 954 + }, + { + "epoch": 1.967326987393877, + "grad_norm": 0.2161017656326294, + "learning_rate": 3.4364261168384884e-07, + "loss": 0.2447, + "step": 955 + }, + { + "epoch": 1.969385129920247, + "grad_norm": 0.21043328940868378, + "learning_rate": 3.2073310423825885e-07, + "loss": 0.259, + "step": 956 + }, + { + "epoch": 1.9714432724466169, + "grad_norm": 0.2055181860923767, + "learning_rate": 2.9782359679266897e-07, + "loss": 0.2612, + "step": 957 + }, + { + "epoch": 1.9735014149729868, + "grad_norm": 0.2101728767156601, + "learning_rate": 2.7491408934707903e-07, + "loss": 0.2616, + "step": 958 + }, + { + "epoch": 1.975559557499357, + "grad_norm": 0.22248664498329163, + "learning_rate": 2.5200458190148915e-07, + "loss": 0.2599, + "step": 959 + }, + { + "epoch": 1.9776177000257267, + "grad_norm": 0.21483425796031952, + "learning_rate": 2.290950744558992e-07, + "loss": 0.2391, + "step": 960 + }, + { + "epoch": 1.9796758425520968, + "grad_norm": 0.20977045595645905, + "learning_rate": 2.061855670103093e-07, + "loss": 0.2518, + "step": 961 + }, + { + "epoch": 1.9817339850784665, + "grad_norm": 0.2119477540254593, + "learning_rate": 1.832760595647194e-07, + "loss": 0.2453, + "step": 962 + }, + { + "epoch": 1.9837921276048367, + "grad_norm": 0.2160489857196808, + "learning_rate": 1.6036655211912943e-07, + "loss": 0.2597, + "step": 963 + }, + { + "epoch": 1.9858502701312066, + "grad_norm": 0.21540690958499908, + "learning_rate": 1.3745704467353952e-07, + "loss": 0.2532, + "step": 964 + }, + { + "epoch": 1.9879084126575766, + "grad_norm": 0.21961499750614166, + "learning_rate": 1.145475372279496e-07, + "loss": 0.254, + "step": 965 + }, + { + "epoch": 1.9899665551839465, + "grad_norm": 0.21246473491191864, + "learning_rate": 9.16380297823597e-08, + "loss": 0.2568, + "step": 966 + }, + { + "epoch": 1.9920246977103164, + "grad_norm": 0.21053671836853027, + "learning_rate": 6.872852233676976e-08, + "loss": 0.248, + "step": 967 + }, + { + "epoch": 1.9940828402366864, + "grad_norm": 0.2094959020614624, + "learning_rate": 4.581901489117985e-08, + "loss": 0.2479, + "step": 968 + }, + { + "epoch": 1.9961409827630563, + "grad_norm": 0.21769192814826965, + "learning_rate": 2.2909507445589924e-08, + "loss": 0.2472, + "step": 969 + }, + { + "epoch": 1.9981991252894264, + "grad_norm": 0.21066489815711975, + "learning_rate": 0.0, + "loss": 0.2607, + "step": 970 + } + ], + "logging_steps": 1, + "max_steps": 970, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 3.205063356630909e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-970/training_args.bin b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-970/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..5999c7ee9dd10ee9076d748e4757533e635fa832 --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/checkpoint-970/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ee55a11f5a306eb7c39b536fdfe2459bc279e468da50f6adda478c4deffcb812 +size 5688 diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/special_tokens_map.json b/codegemma_instruct_cot_ft_lora_r64_alpha64/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..f6119589e367b2de0fc8cbd2f1217667532e3174 --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/special_tokens_map.json @@ -0,0 +1,30 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/tokenizer.json b/codegemma_instruct_cot_ft_lora_r64_alpha64/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..45a5e23f54141c5f4f97a8d58f3ffadc28e287ba --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d964a2c8346d40f95791533eae48730d5f163c2e65fd16333560fd3e661df318 +size 34362915 diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/tokenizer.model b/codegemma_instruct_cot_ft_lora_r64_alpha64/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..71a98ce40269d847e58957e1e070d9ae8eb184af --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:583f2ebd2a1936009b7da991ea255504db68c7a9713a78673d1335a87098966c +size 4241023 diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/tokenizer_config.json b/codegemma_instruct_cot_ft_lora_r64_alpha64/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9b9b1b4acdd4afcedae39d1cf6f0bc7ef7d9910f --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/tokenizer_config.json @@ -0,0 +1,2011 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "<|fim_prefix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "<|fim_middle|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "<|fim_suffix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "<|file_separator|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "139": { + "content": "▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "140": { + "content": "▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "141": { + "content": "▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "142": { + "content": "▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "143": { + "content": "▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "144": { + "content": "▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "145": { + "content": "▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "146": { + "content": "▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "147": { + "content": "▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "148": { + "content": "▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "149": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "150": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "152": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "153": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "154": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "155": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "156": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "157": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "158": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "159": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "160": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "161": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "162": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "163": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "164": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "165": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "166": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "167": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "168": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255968": { + "content": "[toxicity=0]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255969": { + "content": "\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255970": { + "content": "\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255971": { + "content": "\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255972": { + "content": "\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255973": { + "content": "\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255974": { + "content": "\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255975": { + "content": "\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255976": { + "content": "\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255977": { + "content": "\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255978": { + "content": "\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255979": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255980": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255981": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255982": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255983": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255984": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255985": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255986": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255987": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255988": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255989": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255990": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255991": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255992": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255993": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255994": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255995": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255996": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255997": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255998": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255999": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "bos_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "extra_special_tokens": {}, + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "padding_side": "left", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/codegemma_instruct_cot_ft_lora_r64_alpha64/training_args.bin b/codegemma_instruct_cot_ft_lora_r64_alpha64/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..5999c7ee9dd10ee9076d748e4757533e635fa832 --- /dev/null +++ b/codegemma_instruct_cot_ft_lora_r64_alpha64/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ee55a11f5a306eb7c39b536fdfe2459bc279e468da50f6adda478c4deffcb812 +size 5688