hf-user committed · Commit 7c8d415 · 1 parent: daaa48c

0821 Upload LoRA

README.md ADDED
@@ -0,0 +1 @@
+ # gpt-oss-20b LoRA
adapter_config.json ADDED
@@ -0,0 +1,37 @@
+ {
+ "alpha_pattern": {},
+ "auto_mapping": {
+ "base_model_class": "GptOssForCausalLM",
+ "parent_library": "transformers.models.gpt_oss.modeling_gpt_oss"
+ },
+ "base_model_name_or_path": "unsloth/gpt-oss-20b-unsloth-bnb-4bit",
+ "bias": "none",
+ "corda_config": null,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 16,
+ "lora_bias": false,
+ "lora_dropout": 0.05,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "qalora_group_size": 16,
+ "r": 64,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": "(?:.*?(?:vision|image|visual|patch|language|text).*?(?:self_attn|attention|attn|mlp|feed_forward|ffn|dense).*?(?:q_proj|k_proj|v_proj|o_proj|linear|0|1|2|3|4|5|6|7|8|9|10|11|12|13|14|15|16|17|18|19|20|21|22|23|24|25|26|27|28|29|30|31).*?)|(?:\\bmodel\\.layers\\.[\\d]{1,}\\.(?:self_attn|attention|attn|mlp|feed_forward|ffn|dense)\\.(?:(?:q_proj|k_proj|v_proj|o_proj|linear|0|1|2|3|4|5|6|7|8|9|10|11|12|13|14|15|16|17|18|19|20|21|22|23|24|25|26|27|28|29|30|31)))",
+ "target_parameters": null,
+ "task_type": null,
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+ }
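
The adapter config above pins the base model (unsloth/gpt-oss-20b-unsloth-bnb-4bit), rank r=64, lora_alpha=16, dropout 0.05, and a regex for target_modules that matches the attention and MLP projections of every decoder layer. A minimal sketch of how such an adapter could be loaded with PEFT; the adapter repo id below is a placeholder, not the actual upload path:

```python
# Minimal sketch: load the 4-bit base model and attach this LoRA adapter with PEFT.
# "your-username/gpt-oss-20b-lora" is a hypothetical repo id, not the actual upload path.
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

base_id = "unsloth/gpt-oss-20b-unsloth-bnb-4bit"   # from base_model_name_or_path
adapter_id = "your-username/gpt-oss-20b-lora"      # hypothetical adapter repo

tokenizer = AutoTokenizer.from_pretrained(base_id)
base = AutoModelForCausalLM.from_pretrained(base_id, device_map="auto")

# PeftModel reads adapter_config.json (r=64, lora_alpha=16, target_modules regex)
# and injects the LoRA weights from adapter_model.safetensors.
model = PeftModel.from_pretrained(base, adapter_id)
model.eval()
```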
adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:414df4075b7595bf56271e9ff50ba872ce5fed05ad3924bf14a8a78ebc6b8ebf
+ size 127427864
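
adapter_model.safetensors is stored through Git LFS, so the repository only tracks the pointer above (spec version, SHA-256 object id, byte size). A small sketch, assuming a locally downloaded copy named `adapter_model.safetensors`, of checking that the file matches its pointer:

```python
# Sketch: verify a downloaded LFS object against the oid/size in its pointer file.
import hashlib
import os

path = "adapter_model.safetensors"  # local download path (assumption)
expected_oid = "414df4075b7595bf56271e9ff50ba872ce5fed05ad3924bf14a8a78ebc6b8ebf"
expected_size = 127427864

sha = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):  # hash in 1 MiB chunks
        sha.update(chunk)

assert os.path.getsize(path) == expected_size, "size mismatch"
assert sha.hexdigest() == expected_oid, "sha256 mismatch"
print("LFS object matches its pointer")
```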
chat_template.jinja ADDED
@@ -0,0 +1,315 @@
1
+ {# Copyright 2025-present Unsloth. Apache 2.0 License. Unsloth chat template fixes. Edited from ggml-org & OpenAI #}
2
+ {#-
3
+ In addition to the normal inputs of `messages` and `tools`, this template also accepts the
4
+ following kwargs:
5
+ - "builtin_tools": A list, can contain "browser" and/or "python".
6
+ - "model_identity": A string that optionally describes the model identity.
7
+ - "reasoning_effort": A string that describes the reasoning effort, defaults to "medium".
8
+ #}
9
+
10
+ {#- Tool Definition Rendering ============================================== #}
11
+ {%- macro render_typescript_type(param_spec, required_params, is_nullable=false) -%}
12
+ {%- if param_spec.type == "array" -%}
13
+ {%- if param_spec['items'] -%}
14
+ {%- if param_spec['items']['type'] == "string" -%}
15
+ {{- "string[]" }}
16
+ {%- elif param_spec['items']['type'] == "number" -%}
17
+ {{- "number[]" }}
18
+ {%- elif param_spec['items']['type'] == "integer" -%}
19
+ {{- "number[]" }}
20
+ {%- elif param_spec['items']['type'] == "boolean" -%}
21
+ {{- "boolean[]" }}
22
+ {%- else -%}
23
+ {%- set inner_type = render_typescript_type(param_spec['items'], required_params) -%}
24
+ {%- if inner_type == "object | object" or inner_type|length > 50 -%}
25
+ {{- "any[]" }}
26
+ {%- else -%}
27
+ {{- inner_type + "[]" }}
28
+ {%- endif -%}
29
+ {%- endif -%}
30
+ {%- if param_spec.nullable -%}
31
+ {{- " | null" }}
32
+ {%- endif -%}
33
+ {%- else -%}
34
+ {{- "any[]" }}
35
+ {%- if param_spec.nullable -%}
36
+ {{- " | null" }}
37
+ {%- endif -%}
38
+ {%- endif -%}
39
+ {%- elif param_spec.type is defined and param_spec.type is iterable and param_spec.type is not string and param_spec.type is not mapping and param_spec.type[0] is defined -%}
40
+ {#- Handle array of types like ["object", "object"] from Union[dict, list] #}
41
+ {%- if param_spec.type | length > 1 -%}
42
+ {{- param_spec.type | join(" | ") }}
43
+ {%- else -%}
44
+ {{- param_spec.type[0] }}
45
+ {%- endif -%}
46
+ {%- elif param_spec.oneOf -%}
47
+ {#- Handle oneOf schemas - check for complex unions and fallback to any #}
48
+ {%- set has_object_variants = false -%}
49
+ {%- for variant in param_spec.oneOf -%}
50
+ {%- if variant.type == "object" -%}
51
+ {%- set has_object_variants = true -%}
52
+ {%- endif -%}
53
+ {%- endfor -%}
54
+ {%- if has_object_variants and param_spec.oneOf|length > 1 -%}
55
+ {{- "any" }}
56
+ {%- else -%}
57
+ {%- for variant in param_spec.oneOf -%}
58
+ {{- render_typescript_type(variant, required_params) -}}
59
+ {%- if variant.description %}
60
+ {{- "// " + variant.description }}
61
+ {%- endif -%}
62
+ {%- if variant.default is defined %}
63
+ {{ "// default: " + variant.default|tojson }}
64
+ {%- endif -%}
65
+ {%- if not loop.last %}
66
+ {{- " | " }}
67
+ {% endif -%}
68
+ {%- endfor -%}
69
+ {%- endif -%}
70
+ {%- elif param_spec.type == "string" -%}
71
+ {%- if param_spec.enum -%}
72
+ {{- '"' + param_spec.enum|join('" | "') + '"' -}}
73
+ {%- else -%}
74
+ {{- "string" }}
75
+ {%- if param_spec.nullable %}
76
+ {{- " | null" }}
77
+ {%- endif -%}
78
+ {%- endif -%}
79
+ {%- elif param_spec.type == "number" -%}
80
+ {{- "number" }}
81
+ {%- elif param_spec.type == "integer" -%}
82
+ {{- "number" }}
83
+ {%- elif param_spec.type == "boolean" -%}
84
+ {{- "boolean" }}
85
+
86
+ {%- elif param_spec.type == "object" -%}
87
+ {%- if param_spec.properties -%}
88
+ {{- "{\n" }}
89
+ {%- for prop_name, prop_spec in param_spec.properties.items() -%}
90
+ {{- prop_name -}}
91
+ {%- if prop_name not in (param_spec.required or []) -%}
92
+ {{- "?" }}
93
+ {%- endif -%}
94
+ {{- ": " }}
95
+ {{ render_typescript_type(prop_spec, param_spec.required or []) }}
96
+ {%- if not loop.last -%}
97
+ {{-", " }}
98
+ {%- endif -%}
99
+ {%- endfor -%}
100
+ {{- "}" }}
101
+ {%- else -%}
102
+ {{- "object" }}
103
+ {%- endif -%}
104
+ {%- else -%}
105
+ {{- "any" }}
106
+ {%- endif -%}
107
+ {%- endmacro -%}
108
+
109
+ {%- macro render_tool_namespace(namespace_name, tools) -%}
110
+ {{- "## " + namespace_name + "\n\n" }}
111
+ {{- "namespace " + namespace_name + " {\n\n" }}
112
+ {%- for tool in tools %}
113
+ {%- set tool = tool.function %}
114
+ {{- "// " + tool.description + "\n" }}
115
+ {{- "type "+ tool.name + " = " }}
116
+ {%- if tool.parameters and tool.parameters.properties -%}
117
+ {{- "(_: " }}
118
+ {{- "{\n" }}
119
+ {%- for param_name, param_spec in tool.parameters.properties.items() %}
120
+ {{- "// " + param_spec.description + "\n" }}
121
+ {{- param_name }}
122
+ {%- if param_name not in (tool.parameters.required or []) -%}
123
+ {{- "?" }}
124
+ {%- endif -%}
125
+ {{- ": " }}
126
+ {{- render_typescript_type(param_spec, tool.parameters.required or []) }}
127
+ {%- if param_spec.default is defined -%}
128
+ {%- if param_spec.enum %}
129
+ {{- ", // default: " + param_spec.default }}
130
+ {%- elif param_spec.oneOf %}
131
+ {{- "// default: " + param_spec.default }}
132
+ {%- else %}
133
+ {{- ", // default: " + param_spec.default|tojson }}
134
+ {%- endif -%}
135
+ {%- endif -%}
136
+ {%- if not loop.last %}
137
+ {{- ",\n" }}
138
+ {%- else %}
139
+ {{- "\n" }}
140
+ {%- endif -%}
141
+ {%- endfor %}
142
+ {{- "}) => any;\n\n" }}
143
+ {%- else -%}
144
+ {{- "() => any;\n\n" }}
145
+ {%- endif -%}
146
+ {%- endfor %}
147
+ {{- "} // namespace " + namespace_name }}
148
+ {%- endmacro -%}
149
+
150
+ {%- macro render_builtin_tools(browser_tool, python_tool) -%}
151
+ {%- if browser_tool %}
152
+ {{- "## browser\n\n" }}
153
+ {{- "// Tool for browsing.\n" }}
154
+ {{- "// The `cursor` appears in brackets before each browsing display: `[{cursor}]`.\n" }}
155
+ {{- "// Cite information from the tool using the following format:\n" }}
156
+ {{- "// `【{cursor}†L{line_start}(-L{line_end})?】`, for example: `【6†L9-L11】` or `【8†L3】`.\n" }}
157
+ {{- "// Do not quote more than 10 words directly from the tool output.\n" }}
158
+ {{- "// sources=web (default: web)\n" }}
159
+ {{- "namespace browser {\n\n" }}
160
+ {{- "// Searches for information related to `query` and displays `topn` results.\n" }}
161
+ {{- "type search = (_: {\n" }}
162
+ {{- "query: string,\n" }}
163
+ {{- "topn?: number, // default: 10\n" }}
164
+ {{- "source?: string,\n" }}
165
+ {{- "}) => any;\n\n" }}
166
+ {{- "// Opens the link `id` from the page indicated by `cursor` starting at line number `loc`, showing `num_lines` lines.\n" }}
167
+ {{- "// Valid link ids are displayed with the formatting: `【{id}†.*】`.\n" }}
168
+ {{- "// If `cursor` is not provided, the most recent page is implied.\n" }}
169
+ {{- "// If `id` is a string, it is treated as a fully qualified URL associated with `source`.\n" }}
170
+ {{- "// If `loc` is not provided, the viewport will be positioned at the beginning of the document or centered on the most relevant passage, if available.\n" }}
171
+ {{- "// Use this function without `id` to scroll to a new location of an opened page.\n" }}
172
+ {{- "type open = (_: {\n" }}
173
+ {{- "id?: number | string, // default: -1\n" }}
174
+ {{- "cursor?: number, // default: -1\n" }}
175
+ {{- "loc?: number, // default: -1\n" }}
176
+ {{- "num_lines?: number, // default: -1\n" }}
177
+ {{- "view_source?: boolean, // default: false\n" }}
178
+ {{- "source?: string,\n" }}
179
+ {{- "}) => any;\n\n" }}
180
+ {{- "// Finds exact matches of `pattern` in the current page, or the page given by `cursor`.\n" }}
181
+ {{- "type find = (_: {\n" }}
182
+ {{- "pattern: string,\n" }}
183
+ {{- "cursor?: number, // default: -1\n" }}
184
+ {{- "}) => any;\n\n" }}
185
+ {{- "} // namespace browser\n\n" }}
186
+ {%- endif -%}
187
+
188
+ {%- if python_tool %}
189
+ {{- "## python\n\n" }}
190
+ {{- "Use this tool to execute Python code in your chain of thought. The code will not be shown to the user. This tool should be used for internal reasoning, but not for code that is intended to be visible to the user (e.g. when creating plots, tables, or files).\n\n" }}
191
+ {{- "When you send a message containing Python code to python, it will be executed in a stateful Jupyter notebook environment. python will respond with the output of the execution or time out after 120.0 seconds. The drive at '/mnt/data' can be used to save and persist user files. Internet access for this session is UNKNOWN. Depends on the cluster.\n\n" }}
192
+ {%- endif -%}
193
+ {%- endmacro -%}
194
+
195
+ {#- System Message Construction ============================================ #}
196
+ {%- macro build_system_message() -%}
197
+ {%- if model_identity is not defined %}
198
+ {{- "You are ChatGPT, a large language model trained by OpenAI.\n" -}}
199
+ {%- else %}
200
+ {{- model_identity }}
201
+ {%- endif %}
202
+ {{- "Knowledge cutoff: 2024-06\n" }}
203
+ {{- "Current date: " + strftime_now("%Y-%m-%d") + "\n\n" }}
204
+ {%- if reasoning_effort is not defined %}
205
+ {%- set reasoning_effort = "medium" %}
206
+ {%- endif %}
207
+ {{- "Reasoning: " + reasoning_effort + "\n\n" }}
208
+ {%- if builtin_tools is defined %}
209
+ {{- "# Tools\n\n" }}
210
+ {%- set available_builtin_tools = namespace(browser=false, python=false) %}
211
+ {%- for tool in builtin_tools %}
212
+ {%- if tool == "browser" %}
213
+ {%- set available_builtin_tools.browser = true %}
214
+ {%- elif tool == "python" %}
215
+ {%- set available_builtin_tools.python = true %}
216
+ {%- endif %}
217
+ {%- endfor %}
218
+ {{- render_builtin_tools(available_builtin_tools.browser, available_builtin_tools.python) }}
219
+ {%- endif -%}
220
+ {{- "# Valid channels: analysis, commentary, final. Channel must be included for every message." }}
221
+ {%- if tools is defined -%}
222
+ {{- "\nCalls to these tools must go to the commentary channel: 'functions'." }}
223
+ {%- endif -%}
224
+ {%- endmacro -%}
225
+
226
+ {#- Main Template Logic ================================================= #}
227
+ {#- Set defaults #}
228
+
229
+ {#- Render system message #}
230
+ {{- "<|start|>system<|message|>" }}
231
+ {{- build_system_message() }}
232
+ {{- "<|end|>" }}
233
+
234
+ {#- Extract developer message #}
235
+ {%- if messages[0].role == "developer" or messages[0].role == "system" %}
236
+ {%- set developer_message = messages[0].content %}
237
+ {%- set loop_messages = messages[1:] %}
238
+ {%- else %}
239
+ {%- set developer_message = "" %}
240
+ {%- set loop_messages = messages %}
241
+ {%- endif %}
242
+
243
+ {#- Render developer message #}
244
+ {%- if developer_message or tools %}
245
+ {{- "<|start|>developer<|message|>" }}
246
+ {%- if developer_message %}
247
+ {{- "# Instructions\n\n" }}
248
+ {{- developer_message }}
249
+ {%- endif %}
250
+ {%- if tools -%}
251
+ {{- "\n\n" }}
252
+ {{- "# Tools\n\n" }}
253
+ {{- render_tool_namespace("functions", tools) }}
254
+ {%- endif -%}
255
+ {{- "<|end|>" }}
256
+ {%- endif %}
257
+
258
+ {#- Render messages #}
259
+ {%- set last_tool_call = namespace(name=none) %}
260
+ {%- for message in loop_messages -%}
261
+ {#- At this point only assistant/user/tool messages should remain #}
262
+ {%- if message.role == 'assistant' -%}
263
+ {%- if "tool_calls" in message %}
264
+ {#- We assume max 1 tool call per message, and so we infer the tool call name #}
265
+ {#- in "tool" messages from the most recent assistant tool call name #}
266
+ {%- set tool_call = message.tool_calls[0] %}
267
+ {%- if tool_call.function %}
268
+ {%- set tool_call = tool_call.function %}
269
+ {%- endif %}
270
+ {%- if message.content %}
271
+ {{- "<|start|>assistant<|channel|>analysis<|message|>" + message.content + "<|end|>" }}
272
+ {%- endif %}
273
+ {{- "<|start|>assistant to=" }}
274
+ {{- "functions." + tool_call.name + "<|channel|>commentary json<|message|>" }}
275
+ {{- tool_call.arguments|tojson }}
276
+ {{- "<|call|>" }}
277
+ {%- set last_tool_call.name = tool_call.name %}
278
+ {%- elif "thinking" in message and loop.last and not add_generation_prompt %}
279
+ {#- Only render the CoT if the final turn is an assistant turn and add_generation_prompt is false #}
280
+ {#- This is a situation that should only occur in training, never in inference. #}
281
+ {{- "<|start|>assistant<|channel|>analysis<|message|>" + message.thinking + "<|end|>" }}
282
+ {#- <|return|> indicates the end of generation, but <|end|> does not #}
283
+ {#- <|return|> should never be an input to the model, but we include it as the final token #}
284
+ {#- when training, so the model learns to emit it. #}
285
+ {{- "<|start|>assistant<|channel|>final<|message|>" + message.content + "<|return|>" }}
286
+ {%- set last_tool_call.name = none %}
287
+ {%- elif "thinking" in message %}
288
+ {#- CoT is dropped during all previous turns, so we never render it for inference #}
289
+ {{- "<|start|>assistant<|channel|>final<|message|>" + message.content + "<|end|>" }}
290
+ {%- set last_tool_call.name = none %}
291
+ {%- elif loop.last and not add_generation_prompt %}
292
+ {#- <|return|> indicates the end of generation, but <|end|> does not #}
293
+ {#- <|return|> should never be an input to the model, but we include it as the final token #}
294
+ {#- when training, so the model learns to emit it. #}
295
+ {{- "<|start|>assistant<|message|>" + message.content + "<|return|>" }}
296
+ {%- else %}
297
+ {{- "<|start|>assistant<|message|>" + message.content + "<|end|>" }}
298
+ {%- set last_tool_call.name = none %}
299
+ {%- endif %}
300
+ {%- elif message.role == 'tool' -%}
301
+ {%- if last_tool_call.name is none %}
302
+ {{- raise_exception("Message has tool role, but there was no previous assistant message with a tool call!") }}
303
+ {%- endif %}
304
+ {{- "<|start|>functions." + last_tool_call.name }}
305
+ {{- " to=assistant<|channel|>commentary<|message|>" + message.content|tojson + "<|end|>" }}
306
+ {%- else -%}
307
+ {{- "<|start|>user<|message|>" + message.content + "<|end|>" }}
308
+ {%- endif -%}
309
+ {%- endfor -%}
310
+
311
+ {#- Generation prompt #}
312
+ {%- if add_generation_prompt -%}
313
+ <|start|>assistant
314
+ {%- endif -%}
315
+ {# Copyright 2025-present Unsloth. Apache 2.0 License. Unsloth chat template fixes. Edited from ggml-org & OpenAI #}
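
The template above documents three optional kwargs beyond `messages` and `tools`: `builtin_tools`, `model_identity`, and `reasoning_effort` (default "medium"). A hedged sketch of rendering it through the base tokenizer; the message content is illustrative only, and kwarg forwarding assumes a recent transformers release:

```python
# Sketch: render the Harmony-style chat template with its documented kwargs.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("unsloth/gpt-oss-20b-unsloth-bnb-4bit")

messages = [
    {"role": "system", "content": "Answer concisely."},
    {"role": "user", "content": "What is LoRA?"},
]

prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,      # appends the trailing <|start|>assistant
    reasoning_effort="low",          # template kwarg, defaults to "medium"
    model_identity="You are a helpful assistant.",  # overrides the default identity line
)
print(prompt)
```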
checkpoint-3200/README.md ADDED
@@ -0,0 +1,209 @@
1
+ ---
2
+ base_model: unsloth/gpt-oss-20b-unsloth-bnb-4bit
3
+ library_name: peft
4
+ tags:
5
+ - base_model:adapter:unsloth/gpt-oss-20b-unsloth-bnb-4bit
6
+ - lora
7
+ - sft
8
+ - transformers
9
+ - trl
10
+ - unsloth
11
+ ---
12
+
13
+ # Model Card for Model ID
14
+
15
+ <!-- Provide a quick summary of what the model is/does. -->
16
+
17
+
18
+
19
+ ## Model Details
20
+
21
+ ### Model Description
22
+
23
+ <!-- Provide a longer summary of what this model is. -->
24
+
25
+
26
+
27
+ - **Developed by:** [More Information Needed]
28
+ - **Funded by [optional]:** [More Information Needed]
29
+ - **Shared by [optional]:** [More Information Needed]
30
+ - **Model type:** [More Information Needed]
31
+ - **Language(s) (NLP):** [More Information Needed]
32
+ - **License:** [More Information Needed]
33
+ - **Finetuned from model [optional]:** [More Information Needed]
34
+
35
+ ### Model Sources [optional]
36
+
37
+ <!-- Provide the basic links for the model. -->
38
+
39
+ - **Repository:** [More Information Needed]
40
+ - **Paper [optional]:** [More Information Needed]
41
+ - **Demo [optional]:** [More Information Needed]
42
+
43
+ ## Uses
44
+
45
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
46
+
47
+ ### Direct Use
48
+
49
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
50
+
51
+ [More Information Needed]
52
+
53
+ ### Downstream Use [optional]
54
+
55
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
56
+
57
+ [More Information Needed]
58
+
59
+ ### Out-of-Scope Use
60
+
61
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
62
+
63
+ [More Information Needed]
64
+
65
+ ## Bias, Risks, and Limitations
66
+
67
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
68
+
69
+ [More Information Needed]
70
+
71
+ ### Recommendations
72
+
73
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
74
+
75
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
76
+
77
+ ## How to Get Started with the Model
78
+
79
+ Use the code below to get started with the model.
80
+
81
+ [More Information Needed]
82
+
83
+ ## Training Details
84
+
85
+ ### Training Data
86
+
87
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
88
+
89
+ [More Information Needed]
90
+
91
+ ### Training Procedure
92
+
93
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
94
+
95
+ #### Preprocessing [optional]
96
+
97
+ [More Information Needed]
98
+
99
+
100
+ #### Training Hyperparameters
101
+
102
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
103
+
104
+ #### Speeds, Sizes, Times [optional]
105
+
106
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
107
+
108
+ [More Information Needed]
109
+
110
+ ## Evaluation
111
+
112
+ <!-- This section describes the evaluation protocols and provides the results. -->
113
+
114
+ ### Testing Data, Factors & Metrics
115
+
116
+ #### Testing Data
117
+
118
+ <!-- This should link to a Dataset Card if possible. -->
119
+
120
+ [More Information Needed]
121
+
122
+ #### Factors
123
+
124
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
125
+
126
+ [More Information Needed]
127
+
128
+ #### Metrics
129
+
130
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
131
+
132
+ [More Information Needed]
133
+
134
+ ### Results
135
+
136
+ [More Information Needed]
137
+
138
+ #### Summary
139
+
140
+
141
+
142
+ ## Model Examination [optional]
143
+
144
+ <!-- Relevant interpretability work for the model goes here -->
145
+
146
+ [More Information Needed]
147
+
148
+ ## Environmental Impact
149
+
150
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
151
+
152
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
153
+
154
+ - **Hardware Type:** [More Information Needed]
155
+ - **Hours used:** [More Information Needed]
156
+ - **Cloud Provider:** [More Information Needed]
157
+ - **Compute Region:** [More Information Needed]
158
+ - **Carbon Emitted:** [More Information Needed]
159
+
160
+ ## Technical Specifications [optional]
161
+
162
+ ### Model Architecture and Objective
163
+
164
+ [More Information Needed]
165
+
166
+ ### Compute Infrastructure
167
+
168
+ [More Information Needed]
169
+
170
+ #### Hardware
171
+
172
+ [More Information Needed]
173
+
174
+ #### Software
175
+
176
+ [More Information Needed]
177
+
178
+ ## Citation [optional]
179
+
180
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
181
+
182
+ **BibTeX:**
183
+
184
+ [More Information Needed]
185
+
186
+ **APA:**
187
+
188
+ [More Information Needed]
189
+
190
+ ## Glossary [optional]
191
+
192
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
193
+
194
+ [More Information Needed]
195
+
196
+ ## More Information [optional]
197
+
198
+ [More Information Needed]
199
+
200
+ ## Model Card Authors [optional]
201
+
202
+ [More Information Needed]
203
+
204
+ ## Model Card Contact
205
+
206
+ [More Information Needed]
207
+ ### Framework versions
208
+
209
+ - PEFT 0.17.0
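
The card's tags (lora, sft, trl, unsloth) and the PEFT 0.17.0 framework note suggest the adapter came from a TRL-style SFT run on the 4-bit Unsloth base, but the card itself leaves the training details unfilled. The following is only a generic sketch of that setup; the dataset path and hyperparameters are placeholders, not the values actually used:

```python
# Generic sketch of a LoRA SFT run matching the card's tags; not the actual recipe.
from datasets import load_dataset
from peft import LoraConfig
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import SFTConfig, SFTTrainer

base_id = "unsloth/gpt-oss-20b-unsloth-bnb-4bit"
tokenizer = AutoTokenizer.from_pretrained(base_id)
model = AutoModelForCausalLM.from_pretrained(base_id, device_map="auto")

# Mirrors adapter_config.json: r=64, lora_alpha=16, lora_dropout=0.05, bias="none".
peft_config = LoraConfig(
    r=64, lora_alpha=16, lora_dropout=0.05, bias="none", task_type="CAUSAL_LM",
)

dataset = load_dataset("json", data_files="train.jsonl", split="train")  # placeholder dataset

trainer = SFTTrainer(
    model=model,
    args=SFTConfig(
        output_dir="outputs",
        per_device_train_batch_size=1,
        gradient_accumulation_steps=8,
        learning_rate=2e-5,        # matches the peak LR visible in trainer_state.json
        num_train_epochs=1,
        logging_steps=10,          # trainer_state.json logs every 10 steps
        save_steps=3200,
    ),
    train_dataset=dataset,
    peft_config=peft_config,
    processing_class=tokenizer,
)
trainer.train()
```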
checkpoint-3200/adapter_config.json ADDED
@@ -0,0 +1,37 @@
(contents identical to adapter_config.json above)
checkpoint-3200/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:829b9b2994af185d19c1ba9a8ac6e521d4ad898db58cd573a259ccad97310104
+ size 127427864
checkpoint-3200/chat_template.jinja ADDED
@@ -0,0 +1,315 @@
(contents identical to chat_template.jinja above)
checkpoint-3200/optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8cab6e6c5bf7b3d1fb54e3a91db93ab9e2244d23f07b44d5aae498912c579dfb
+ size 64923339
checkpoint-3200/rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:701feb11d69a84a2da7e4619d6510f13d556a262e74da448c7c64cdb92819be7
+ size 14645
checkpoint-3200/scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:278950bc5d61de4ff926e42d8fcebbede7de6e32fdc8d51dc7b9f27ebafa0df9
+ size 1465
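
optimizer.pt, rng_state.pth, and scheduler.pt hold the optimizer state, RNG state, and LR-scheduler state for this checkpoint, which is what lets a Trainer-based run continue from global step 3200 instead of restarting. A sketch, assuming the same Trainer/SFTTrainer object used for the original run and a local copy of the checkpoint directory:

```python
# Sketch: resume a Trainer/SFTTrainer run from this saved checkpoint directory.
# The trainer object and output_dir are assumed to match the original training setup.
trainer.train(resume_from_checkpoint="checkpoint-3200")

# Or let Trainer pick the latest checkpoint-* folder under args.output_dir:
trainer.train(resume_from_checkpoint=True)
```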
checkpoint-3200/special_tokens_map.json ADDED
@@ -0,0 +1,23 @@
+ {
+ "bos_token": {
+ "content": "<|startoftext|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "<|return|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": {
+ "content": "<|reserved_200017|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+ }
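
special_tokens_map.json sets <|startoftext|> as BOS, <|return|> as EOS, and <|reserved_200017|> as the pad token. A sketch of making sure those ids stop and pad generation with the fine-tuned model (model and tokenizer as loaded earlier; the prompt is illustrative):

```python
# Sketch: generate with the EOS/pad ids defined in special_tokens_map.json.
inputs = tokenizer("Hello", return_tensors="pt").to(model.device)

output_ids = model.generate(
    **inputs,
    max_new_tokens=128,
    eos_token_id=tokenizer.eos_token_id,   # id of "<|return|>"
    pad_token_id=tokenizer.pad_token_id,   # id of "<|reserved_200017|>"
)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
```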
checkpoint-3200/tokenizer.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d54615cba5113384c7495974d13feabc433e3e27e9262ac6a1a77f762a48d1c8
+ size 27868273
checkpoint-3200/tokenizer_config.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bd9f6c1cec89c00a50e0eea8e27f8feadc89a155d49368d7a76e7de1f462cff5
+ size 4229
checkpoint-3200/trainer_state.json ADDED
@@ -0,0 +1,2274 @@
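
trainer_state.json below records one log_history entry every 10 steps (epoch, grad_norm, learning_rate, loss) up to global step 3200. A small sketch for pulling the loss curve out of a local copy of the file; matplotlib is assumed to be available:

```python
# Sketch: read the loss curve from a checkpoint's trainer_state.json.
import json
import matplotlib.pyplot as plt

with open("checkpoint-3200/trainer_state.json") as f:
    state = json.load(f)

steps = [e["step"] for e in state["log_history"] if "loss" in e]
losses = [e["loss"] for e in state["log_history"] if "loss" in e]

plt.plot(steps, losses)
plt.xlabel("global step")
plt.ylabel("training loss")
plt.title("gpt-oss-20b LoRA SFT loss")
plt.savefig("loss_curve.png")
```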
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 0.9829519275072953,
6
+ "eval_steps": 500,
7
+ "global_step": 3200,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.003071724773460298,
14
+ "grad_norm": 8.419622421264648,
15
+ "learning_rate": 1.8367346938775512e-06,
16
+ "loss": 7.0015,
17
+ "step": 10
18
+ },
19
+ {
20
+ "epoch": 0.006143449546920596,
21
+ "grad_norm": 10.31521224975586,
22
+ "learning_rate": 3.877551020408164e-06,
23
+ "loss": 6.9781,
24
+ "step": 20
25
+ },
26
+ {
27
+ "epoch": 0.009215174320380893,
28
+ "grad_norm": 11.546846389770508,
29
+ "learning_rate": 5.918367346938776e-06,
30
+ "loss": 6.8691,
31
+ "step": 30
32
+ },
33
+ {
34
+ "epoch": 0.012286899093841192,
35
+ "grad_norm": 6.222589015960693,
36
+ "learning_rate": 7.959183673469388e-06,
37
+ "loss": 6.2584,
38
+ "step": 40
39
+ },
40
+ {
41
+ "epoch": 0.01535862386730149,
42
+ "grad_norm": 5.404016971588135,
43
+ "learning_rate": 1e-05,
44
+ "loss": 5.5826,
45
+ "step": 50
46
+ },
47
+ {
48
+ "epoch": 0.018430348640761787,
49
+ "grad_norm": 3.200059652328491,
50
+ "learning_rate": 1.2040816326530614e-05,
51
+ "loss": 4.7731,
52
+ "step": 60
53
+ },
54
+ {
55
+ "epoch": 0.021502073414222087,
56
+ "grad_norm": 2.567652463912964,
57
+ "learning_rate": 1.4081632653061225e-05,
58
+ "loss": 4.1807,
59
+ "step": 70
60
+ },
61
+ {
62
+ "epoch": 0.024573798187682384,
63
+ "grad_norm": 2.171948194503784,
64
+ "learning_rate": 1.612244897959184e-05,
65
+ "loss": 3.6721,
66
+ "step": 80
67
+ },
68
+ {
69
+ "epoch": 0.02764552296114268,
70
+ "grad_norm": 1.7318979501724243,
71
+ "learning_rate": 1.816326530612245e-05,
72
+ "loss": 3.2164,
73
+ "step": 90
74
+ },
75
+ {
76
+ "epoch": 0.03071724773460298,
77
+ "grad_norm": 1.588425636291504,
78
+ "learning_rate": 1.9999995051820308e-05,
79
+ "loss": 2.8804,
80
+ "step": 100
81
+ },
82
+ {
83
+ "epoch": 0.03378897250806328,
84
+ "grad_norm": 1.8087046146392822,
85
+ "learning_rate": 1.9999401276182468e-05,
86
+ "loss": 2.4881,
87
+ "step": 110
88
+ },
89
+ {
90
+ "epoch": 0.03686069728152357,
91
+ "grad_norm": 1.8364018201828003,
92
+ "learning_rate": 1.999781793193742e-05,
93
+ "loss": 2.0435,
94
+ "step": 120
95
+ },
96
+ {
97
+ "epoch": 0.039932422054983874,
98
+ "grad_norm": 1.41903817653656,
99
+ "learning_rate": 1.9995245175777322e-05,
100
+ "loss": 1.8227,
101
+ "step": 130
102
+ },
103
+ {
104
+ "epoch": 0.043004146828444174,
105
+ "grad_norm": 1.4173569679260254,
106
+ "learning_rate": 1.9991683262309292e-05,
107
+ "loss": 1.5215,
108
+ "step": 140
109
+ },
110
+ {
111
+ "epoch": 0.04607587160190447,
112
+ "grad_norm": 1.2530128955841064,
113
+ "learning_rate": 1.998713254403021e-05,
114
+ "loss": 1.4296,
115
+ "step": 150
116
+ },
117
+ {
118
+ "epoch": 0.04914759637536477,
119
+ "grad_norm": 0.9256352782249451,
120
+ "learning_rate": 1.9981593471291828e-05,
121
+ "loss": 1.3521,
122
+ "step": 160
123
+ },
124
+ {
125
+ "epoch": 0.05221932114882506,
126
+ "grad_norm": 1.2156378030776978,
127
+ "learning_rate": 1.9975066592256226e-05,
128
+ "loss": 1.2796,
129
+ "step": 170
130
+ },
131
+ {
132
+ "epoch": 0.05529104592228536,
133
+ "grad_norm": 0.7624578475952148,
134
+ "learning_rate": 1.996755255284153e-05,
135
+ "loss": 1.2564,
136
+ "step": 180
137
+ },
138
+ {
139
+ "epoch": 0.058362770695745664,
140
+ "grad_norm": 0.8461359739303589,
141
+ "learning_rate": 1.9959052096658015e-05,
142
+ "loss": 1.2598,
143
+ "step": 190
144
+ },
145
+ {
146
+ "epoch": 0.06143449546920596,
147
+ "grad_norm": 0.7803846597671509,
148
+ "learning_rate": 1.99495660649345e-05,
149
+ "loss": 1.292,
150
+ "step": 200
151
+ },
152
+ {
153
+ "epoch": 0.06450622024266625,
154
+ "grad_norm": 0.8108134865760803,
155
+ "learning_rate": 1.9939095396435123e-05,
156
+ "loss": 1.274,
157
+ "step": 210
158
+ },
159
+ {
160
+ "epoch": 0.06757794501612656,
161
+ "grad_norm": 0.6554253697395325,
162
+ "learning_rate": 1.99276411273664e-05,
163
+ "loss": 1.1686,
164
+ "step": 220
165
+ },
166
+ {
167
+ "epoch": 0.07064966978958685,
168
+ "grad_norm": 0.7851625084877014,
169
+ "learning_rate": 1.991520439127471e-05,
170
+ "loss": 1.2176,
171
+ "step": 230
172
+ },
173
+ {
174
+ "epoch": 0.07372139456304715,
175
+ "grad_norm": 0.933299720287323,
176
+ "learning_rate": 1.9901786418934105e-05,
177
+ "loss": 1.2398,
178
+ "step": 240
179
+ },
180
+ {
181
+ "epoch": 0.07679311933650745,
182
+ "grad_norm": 0.8452703356742859,
183
+ "learning_rate": 1.9887388538224504e-05,
184
+ "loss": 1.2785,
185
+ "step": 250
186
+ },
187
+ {
188
+ "epoch": 0.07986484410996775,
189
+ "grad_norm": 0.7061429619789124,
190
+ "learning_rate": 1.9872012174000298e-05,
191
+ "loss": 1.1552,
192
+ "step": 260
193
+ },
194
+ {
195
+ "epoch": 0.08293656888342804,
196
+ "grad_norm": 0.6826983690261841,
197
+ "learning_rate": 1.9855658847949324e-05,
198
+ "loss": 1.0678,
199
+ "step": 270
200
+ },
201
+ {
202
+ "epoch": 0.08600829365688835,
203
+ "grad_norm": 0.7688930630683899,
204
+ "learning_rate": 1.9838330178442288e-05,
205
+ "loss": 1.2263,
206
+ "step": 280
207
+ },
208
+ {
209
+ "epoch": 0.08908001843034864,
210
+ "grad_norm": 0.725660502910614,
211
+ "learning_rate": 1.9820027880372598e-05,
212
+ "loss": 1.1576,
213
+ "step": 290
214
+ },
215
+ {
216
+ "epoch": 0.09215174320380894,
217
+ "grad_norm": 0.8614407181739807,
218
+ "learning_rate": 1.980075376498666e-05,
219
+ "loss": 1.1923,
220
+ "step": 300
221
+ },
222
+ {
223
+ "epoch": 0.09522346797726923,
224
+ "grad_norm": 0.7618085145950317,
225
+ "learning_rate": 1.9780509739704623e-05,
226
+ "loss": 1.2018,
227
+ "step": 310
228
+ },
229
+ {
230
+ "epoch": 0.09829519275072954,
231
+ "grad_norm": 0.8258978724479675,
232
+ "learning_rate": 1.9759297807931634e-05,
233
+ "loss": 1.1869,
234
+ "step": 320
235
+ },
236
+ {
237
+ "epoch": 0.10136691752418983,
238
+ "grad_norm": 0.8120176196098328,
239
+ "learning_rate": 1.9737120068859546e-05,
240
+ "loss": 1.1871,
241
+ "step": 330
242
+ },
243
+ {
244
+ "epoch": 0.10443864229765012,
245
+ "grad_norm": 0.6733288168907166,
246
+ "learning_rate": 1.9713978717259207e-05,
247
+ "loss": 1.1853,
248
+ "step": 340
249
+ },
250
+ {
251
+ "epoch": 0.10751036707111043,
252
+ "grad_norm": 0.7299067974090576,
253
+ "learning_rate": 1.9689876043263238e-05,
254
+ "loss": 1.1829,
255
+ "step": 350
256
+ },
257
+ {
258
+ "epoch": 0.11058209184457073,
259
+ "grad_norm": 0.8712752461433411,
260
+ "learning_rate": 1.9664814432139408e-05,
261
+ "loss": 1.1995,
262
+ "step": 360
263
+ },
264
+ {
265
+ "epoch": 0.11365381661803102,
266
+ "grad_norm": 0.9290866255760193,
267
+ "learning_rate": 1.9638796364054566e-05,
268
+ "loss": 1.1771,
269
+ "step": 370
270
+ },
271
+ {
272
+ "epoch": 0.11672554139149133,
273
+ "grad_norm": 0.7293970584869385,
274
+ "learning_rate": 1.9611824413829215e-05,
275
+ "loss": 1.134,
276
+ "step": 380
277
+ },
278
+ {
279
+ "epoch": 0.11979726616495162,
280
+ "grad_norm": 1.0634404420852661,
281
+ "learning_rate": 1.9583901250682687e-05,
282
+ "loss": 1.1265,
283
+ "step": 390
284
+ },
285
+ {
286
+ "epoch": 0.12286899093841191,
287
+ "grad_norm": 0.9366344213485718,
288
+ "learning_rate": 1.9555029637969005e-05,
289
+ "loss": 1.1802,
290
+ "step": 400
291
+ },
292
+ {
293
+ "epoch": 0.1259407157118722,
294
+ "grad_norm": 0.8466936945915222,
295
+ "learning_rate": 1.9525212432903388e-05,
296
+ "loss": 1.2034,
297
+ "step": 410
298
+ },
299
+ {
300
+ "epoch": 0.1290124404853325,
301
+ "grad_norm": 0.7974659204483032,
302
+ "learning_rate": 1.9494452586279516e-05,
303
+ "loss": 1.1439,
304
+ "step": 420
305
+ },
306
+ {
307
+ "epoch": 0.13208416525879282,
308
+ "grad_norm": 1.080421805381775,
309
+ "learning_rate": 1.9462753142177507e-05,
310
+ "loss": 1.12,
311
+ "step": 430
312
+ },
313
+ {
314
+ "epoch": 0.13515589003225312,
315
+ "grad_norm": 0.9817642569541931,
316
+ "learning_rate": 1.9430117237662654e-05,
317
+ "loss": 1.1423,
318
+ "step": 440
319
+ },
320
+ {
321
+ "epoch": 0.1382276148057134,
322
+ "grad_norm": 0.8396655917167664,
323
+ "learning_rate": 1.9396548102474992e-05,
324
+ "loss": 1.146,
325
+ "step": 450
326
+ },
327
+ {
328
+ "epoch": 0.1412993395791737,
329
+ "grad_norm": 0.8537242412567139,
330
+ "learning_rate": 1.936204905870966e-05,
331
+ "loss": 1.0609,
332
+ "step": 460
333
+ },
334
+ {
335
+ "epoch": 0.144371064352634,
336
+ "grad_norm": 0.7413896918296814,
337
+ "learning_rate": 1.932662352048813e-05,
338
+ "loss": 1.1508,
339
+ "step": 470
340
+ },
341
+ {
342
+ "epoch": 0.1474427891260943,
343
+ "grad_norm": 0.9202362895011902,
344
+ "learning_rate": 1.929027499362036e-05,
345
+ "loss": 1.1131,
346
+ "step": 480
347
+ },
348
+ {
349
+ "epoch": 0.1505145138995546,
350
+ "grad_norm": 1.1642792224884033,
351
+ "learning_rate": 1.9253007075257833e-05,
352
+ "loss": 1.1172,
353
+ "step": 490
354
+ },
355
+ {
356
+ "epoch": 0.1535862386730149,
357
+ "grad_norm": 0.8923482894897461,
358
+ "learning_rate": 1.9214823453537568e-05,
359
+ "loss": 1.1486,
360
+ "step": 500
361
+ },
362
+ {
363
+ "epoch": 0.1566579634464752,
364
+ "grad_norm": 0.8619294166564941,
365
+ "learning_rate": 1.9175727907217153e-05,
366
+ "loss": 1.0841,
367
+ "step": 510
368
+ },
369
+ {
370
+ "epoch": 0.1597296882199355,
371
+ "grad_norm": 0.8567745089530945,
372
+ "learning_rate": 1.9135724305300757e-05,
373
+ "loss": 1.2078,
374
+ "step": 520
375
+ },
376
+ {
377
+ "epoch": 0.1628014129933958,
378
+ "grad_norm": 0.924022912979126,
379
+ "learning_rate": 1.9094816606656272e-05,
380
+ "loss": 1.0892,
381
+ "step": 530
382
+ },
383
+ {
384
+ "epoch": 0.16587313776685608,
385
+ "grad_norm": 1.2680057287216187,
386
+ "learning_rate": 1.9053008859623527e-05,
387
+ "loss": 1.1733,
388
+ "step": 540
389
+ },
390
+ {
391
+ "epoch": 0.16894486254031638,
392
+ "grad_norm": 0.9168750643730164,
393
+ "learning_rate": 1.9010305201613625e-05,
394
+ "loss": 1.0919,
395
+ "step": 550
396
+ },
397
+ {
398
+ "epoch": 0.1720165873137767,
399
+ "grad_norm": 1.0453463792800903,
400
+ "learning_rate": 1.8966709858699542e-05,
401
+ "loss": 1.1425,
402
+ "step": 560
403
+ },
404
+ {
405
+ "epoch": 0.175088312087237,
406
+ "grad_norm": 0.9679042100906372,
407
+ "learning_rate": 1.8922227145197856e-05,
408
+ "loss": 1.1223,
409
+ "step": 570
410
+ },
411
+ {
412
+ "epoch": 0.17816003686069729,
413
+ "grad_norm": 1.0852516889572144,
414
+ "learning_rate": 1.887686146324182e-05,
415
+ "loss": 1.09,
416
+ "step": 580
417
+ },
418
+ {
419
+ "epoch": 0.18123176163415758,
420
+ "grad_norm": 1.0620839595794678,
421
+ "learning_rate": 1.8830617302345706e-05,
422
+ "loss": 1.1012,
423
+ "step": 590
424
+ },
425
+ {
426
+ "epoch": 0.18430348640761787,
427
+ "grad_norm": 0.8156954646110535,
428
+ "learning_rate": 1.8783499238960495e-05,
429
+ "loss": 1.0889,
430
+ "step": 600
431
+ },
432
+ {
433
+ "epoch": 0.18737521118107817,
434
+ "grad_norm": 1.028428077697754,
435
+ "learning_rate": 1.8735511936021016e-05,
436
+ "loss": 1.1586,
437
+ "step": 610
438
+ },
439
+ {
440
+ "epoch": 0.19044693595453846,
441
+ "grad_norm": 0.893195629119873,
442
+ "learning_rate": 1.8686660142484446e-05,
443
+ "loss": 1.0948,
444
+ "step": 620
445
+ },
446
+ {
447
+ "epoch": 0.19351866072799878,
448
+ "grad_norm": 1.1205710172653198,
449
+ "learning_rate": 1.8636948692860373e-05,
450
+ "loss": 1.141,
451
+ "step": 630
452
+ },
453
+ {
454
+ "epoch": 0.19659038550145908,
455
+ "grad_norm": 0.8362760543823242,
456
+ "learning_rate": 1.8586382506732334e-05,
457
+ "loss": 1.0666,
458
+ "step": 640
459
+ },
460
+ {
461
+ "epoch": 0.19966211027491937,
462
+ "grad_norm": 0.8918741345405579,
463
+ "learning_rate": 1.8534966588270987e-05,
464
+ "loss": 1.0774,
465
+ "step": 650
466
+ },
467
+ {
468
+ "epoch": 0.20273383504837966,
469
+ "grad_norm": 1.0254731178283691,
470
+ "learning_rate": 1.8482706025738856e-05,
471
+ "loss": 1.0671,
472
+ "step": 660
473
+ },
474
+ {
475
+ "epoch": 0.20580555982183996,
476
+ "grad_norm": 0.9713171720504761,
477
+ "learning_rate": 1.8429605990986797e-05,
478
+ "loss": 1.1728,
479
+ "step": 670
480
+ },
481
+ {
482
+ "epoch": 0.20887728459530025,
483
+ "grad_norm": 1.1824350357055664,
484
+ "learning_rate": 1.8375671738942183e-05,
485
+ "loss": 1.0991,
486
+ "step": 680
487
+ },
488
+ {
489
+ "epoch": 0.21194900936876057,
490
+ "grad_norm": 0.9485971331596375,
491
+ "learning_rate": 1.8320908607088847e-05,
492
+ "loss": 1.1219,
493
+ "step": 690
494
+ },
495
+ {
496
+ "epoch": 0.21502073414222087,
497
+ "grad_norm": 0.9417327642440796,
498
+ "learning_rate": 1.8265322014938883e-05,
499
+ "loss": 1.093,
500
+ "step": 700
501
+ },
502
+ {
503
+ "epoch": 0.21809245891568116,
504
+ "grad_norm": 1.0645897388458252,
505
+ "learning_rate": 1.82089174634963e-05,
506
+ "loss": 1.0401,
507
+ "step": 710
508
+ },
509
+ {
510
+ "epoch": 0.22116418368914145,
511
+ "grad_norm": 1.0113904476165771,
512
+ "learning_rate": 1.815170053471265e-05,
513
+ "loss": 1.0896,
514
+ "step": 720
515
+ },
516
+ {
517
+ "epoch": 0.22423590846260175,
518
+ "grad_norm": 0.988387405872345,
519
+ "learning_rate": 1.80936768909346e-05,
520
+ "loss": 1.1149,
521
+ "step": 730
522
+ },
523
+ {
524
+ "epoch": 0.22730763323606204,
525
+ "grad_norm": 1.0920623540878296,
526
+ "learning_rate": 1.8034852274343585e-05,
527
+ "loss": 1.1012,
528
+ "step": 740
529
+ },
530
+ {
531
+ "epoch": 0.23037935800952233,
532
+ "grad_norm": 1.0729286670684814,
533
+ "learning_rate": 1.797523250638754e-05,
534
+ "loss": 1.1034,
535
+ "step": 750
536
+ },
537
+ {
538
+ "epoch": 0.23345108278298266,
539
+ "grad_norm": 1.054861068725586,
540
+ "learning_rate": 1.7914823487204796e-05,
541
+ "loss": 1.1171,
542
+ "step": 760
543
+ },
544
+ {
545
+ "epoch": 0.23652280755644295,
546
+ "grad_norm": 1.0314829349517822,
547
+ "learning_rate": 1.7853631195040178e-05,
548
+ "loss": 1.0606,
549
+ "step": 770
550
+ },
551
+ {
552
+ "epoch": 0.23959453232990324,
553
+ "grad_norm": 1.354514718055725,
554
+ "learning_rate": 1.7791661685653395e-05,
555
+ "loss": 1.0771,
556
+ "step": 780
557
+ },
558
+ {
559
+ "epoch": 0.24266625710336354,
560
+ "grad_norm": 1.1574825048446655,
561
+ "learning_rate": 1.7728921091719733e-05,
562
+ "loss": 1.0866,
563
+ "step": 790
564
+ },
565
+ {
566
+ "epoch": 0.24573798187682383,
567
+ "grad_norm": 1.0411062240600586,
568
+ "learning_rate": 1.7665415622223155e-05,
569
+ "loss": 1.0832,
570
+ "step": 800
571
+ },
572
+ {
573
+ "epoch": 0.24880970665028412,
574
+ "grad_norm": 1.0036734342575073,
575
+ "learning_rate": 1.760115156184184e-05,
576
+ "loss": 1.0529,
577
+ "step": 810
578
+ },
579
+ {
580
+ "epoch": 0.2518814314237444,
581
+ "grad_norm": 1.2075008153915405,
582
+ "learning_rate": 1.753613527032623e-05,
583
+ "loss": 1.1115,
584
+ "step": 820
585
+ },
586
+ {
587
+ "epoch": 0.2549531561972047,
588
+ "grad_norm": 1.1035667657852173,
589
+ "learning_rate": 1.7470373181869667e-05,
590
+ "loss": 1.0905,
591
+ "step": 830
592
+ },
593
+ {
594
+ "epoch": 0.258024880970665,
595
+ "grad_norm": 1.0951581001281738,
596
+ "learning_rate": 1.740387180447162e-05,
597
+ "loss": 1.0305,
598
+ "step": 840
599
+ },
600
+ {
601
+ "epoch": 0.26109660574412535,
602
+ "grad_norm": 1.2153475284576416,
603
+ "learning_rate": 1.7336637719293667e-05,
604
+ "loss": 1.0678,
605
+ "step": 850
606
+ },
607
+ {
608
+ "epoch": 0.26416833051758565,
609
+ "grad_norm": 1.0323575735092163,
610
+ "learning_rate": 1.726867758000818e-05,
611
+ "loss": 1.041,
612
+ "step": 860
613
+ },
614
+ {
615
+ "epoch": 0.26724005529104594,
616
+ "grad_norm": 1.111869215965271,
617
+ "learning_rate": 1.7199998112139863e-05,
618
+ "loss": 1.0982,
619
+ "step": 870
620
+ },
621
+ {
622
+ "epoch": 0.27031178006450624,
623
+ "grad_norm": 1.1253854036331177,
624
+ "learning_rate": 1.71306061124002e-05,
625
+ "loss": 1.1006,
626
+ "step": 880
627
+ },
628
+ {
629
+ "epoch": 0.27338350483796653,
630
+ "grad_norm": 1.165722131729126,
631
+ "learning_rate": 1.706050844801479e-05,
632
+ "loss": 1.0595,
633
+ "step": 890
634
+ },
635
+ {
636
+ "epoch": 0.2764552296114268,
637
+ "grad_norm": 1.0169954299926758,
638
+ "learning_rate": 1.6989712056043786e-05,
639
+ "loss": 1.0459,
640
+ "step": 900
641
+ },
642
+ {
643
+ "epoch": 0.2795269543848871,
644
+ "grad_norm": 1.1502857208251953,
645
+ "learning_rate": 1.6918223942695374e-05,
646
+ "loss": 1.0645,
647
+ "step": 910
648
+ },
649
+ {
650
+ "epoch": 0.2825986791583474,
651
+ "grad_norm": 0.9404005408287048,
652
+ "learning_rate": 1.6846051182632396e-05,
653
+ "loss": 1.0838,
654
+ "step": 920
655
+ },
656
+ {
657
+ "epoch": 0.2856704039318077,
658
+ "grad_norm": 0.9998802542686462,
659
+ "learning_rate": 1.6773200918272257e-05,
660
+ "loss": 1.0573,
661
+ "step": 930
662
+ },
663
+ {
664
+ "epoch": 0.288742128705268,
665
+ "grad_norm": 1.1572364568710327,
666
+ "learning_rate": 1.6699680359080066e-05,
667
+ "loss": 1.0408,
668
+ "step": 940
669
+ },
670
+ {
671
+ "epoch": 0.2918138534787283,
672
+ "grad_norm": 1.0092674493789673,
673
+ "learning_rate": 1.662549678085518e-05,
674
+ "loss": 1.0499,
675
+ "step": 950
676
+ },
677
+ {
678
+ "epoch": 0.2948855782521886,
679
+ "grad_norm": 1.1230064630508423,
680
+ "learning_rate": 1.6550657525011163e-05,
681
+ "loss": 1.0278,
682
+ "step": 960
683
+ },
684
+ {
685
+ "epoch": 0.2979573030256489,
686
+ "grad_norm": 1.0530235767364502,
687
+ "learning_rate": 1.6475169997849267e-05,
688
+ "loss": 1.0977,
689
+ "step": 970
690
+ },
691
+ {
692
+ "epoch": 0.3010290277991092,
693
+ "grad_norm": 1.2637819051742554,
694
+ "learning_rate": 1.6399041669825478e-05,
695
+ "loss": 1.0096,
696
+ "step": 980
697
+ },
698
+ {
699
+ "epoch": 0.3041007525725695,
700
+ "grad_norm": 1.0472123622894287,
701
+ "learning_rate": 1.632228007481122e-05,
702
+ "loss": 1.0773,
703
+ "step": 990
704
+ },
705
+ {
706
+ "epoch": 0.3071724773460298,
707
+ "grad_norm": 1.0698893070220947,
708
+ "learning_rate": 1.624489280934778e-05,
709
+ "loss": 1.0724,
710
+ "step": 1000
711
+ },
712
+ {
713
+ "epoch": 0.3102442021194901,
714
+ "grad_norm": 1.4538482427597046,
715
+ "learning_rate": 1.616688753189454e-05,
716
+ "loss": 1.0787,
717
+ "step": 1010
718
+ },
719
+ {
720
+ "epoch": 0.3133159268929504,
721
+ "grad_norm": 1.0167417526245117,
722
+ "learning_rate": 1.6088271962071067e-05,
723
+ "loss": 1.1064,
724
+ "step": 1020
725
+ },
726
+ {
727
+ "epoch": 0.3163876516664107,
728
+ "grad_norm": 1.1629698276519775,
729
+ "learning_rate": 1.6009053879893164e-05,
730
+ "loss": 1.0071,
731
+ "step": 1030
732
+ },
733
+ {
734
+ "epoch": 0.319459376439871,
735
+ "grad_norm": 1.2197659015655518,
736
+ "learning_rate": 1.5929241125002936e-05,
737
+ "loss": 1.0701,
738
+ "step": 1040
739
+ },
740
+ {
741
+ "epoch": 0.3225311012133313,
742
+ "grad_norm": 1.2297234535217285,
743
+ "learning_rate": 1.584884159589295e-05,
744
+ "loss": 1.0286,
745
+ "step": 1050
746
+ },
747
+ {
748
+ "epoch": 0.3256028259867916,
749
+ "grad_norm": 1.1668790578842163,
750
+ "learning_rate": 1.5767863249124588e-05,
751
+ "loss": 1.0598,
752
+ "step": 1060
753
+ },
754
+ {
755
+ "epoch": 0.32867455076025187,
756
+ "grad_norm": 1.219712257385254,
757
+ "learning_rate": 1.5686314098540643e-05,
758
+ "loss": 1.0586,
759
+ "step": 1070
760
+ },
761
+ {
762
+ "epoch": 0.33174627553371216,
763
+ "grad_norm": 1.072532057762146,
764
+ "learning_rate": 1.560420221447224e-05,
765
+ "loss": 1.0879,
766
+ "step": 1080
767
+ },
768
+ {
769
+ "epoch": 0.33481800030717246,
770
+ "grad_norm": 1.2042076587677002,
771
+ "learning_rate": 1.552153572294018e-05,
772
+ "loss": 1.0622,
773
+ "step": 1090
774
+ },
775
+ {
776
+ "epoch": 0.33788972508063275,
777
+ "grad_norm": 1.2498126029968262,
778
+ "learning_rate": 1.5438322804850762e-05,
779
+ "loss": 1.0422,
780
+ "step": 1100
781
+ },
782
+ {
783
+ "epoch": 0.34096144985409305,
784
+ "grad_norm": 1.2515864372253418,
785
+ "learning_rate": 1.5354571695186175e-05,
786
+ "loss": 1.0303,
787
+ "step": 1110
788
+ },
789
+ {
790
+ "epoch": 0.3440331746275534,
791
+ "grad_norm": 1.1558114290237427,
792
+ "learning_rate": 1.5270290682189556e-05,
793
+ "loss": 0.9732,
794
+ "step": 1120
795
+ },
796
+ {
797
+ "epoch": 0.3471048994010137,
798
+ "grad_norm": 1.3687546253204346,
799
+ "learning_rate": 1.5185488106544743e-05,
800
+ "loss": 1.0289,
801
+ "step": 1130
802
+ },
803
+ {
804
+ "epoch": 0.350176624174474,
805
+ "grad_norm": 1.5358903408050537,
806
+ "learning_rate": 1.5100172360550873e-05,
807
+ "loss": 1.0093,
808
+ "step": 1140
809
+ },
810
+ {
811
+ "epoch": 0.3532483489479343,
812
+ "grad_norm": 2.732478380203247,
813
+ "learning_rate": 1.5014351887291843e-05,
814
+ "loss": 1.0015,
815
+ "step": 1150
816
+ },
817
+ {
818
+ "epoch": 0.35632007372139457,
819
+ "grad_norm": 2.0520331859588623,
820
+ "learning_rate": 1.4928035179800772e-05,
821
+ "loss": 0.9988,
822
+ "step": 1160
823
+ },
824
+ {
825
+ "epoch": 0.35939179849485486,
826
+ "grad_norm": 1.1731293201446533,
827
+ "learning_rate": 1.48412307802195e-05,
828
+ "loss": 0.9697,
829
+ "step": 1170
830
+ },
831
+ {
832
+ "epoch": 0.36246352326831516,
833
+ "grad_norm": 1.8224886655807495,
834
+ "learning_rate": 1.4753947278953235e-05,
835
+ "loss": 1.0542,
836
+ "step": 1180
837
+ },
838
+ {
839
+ "epoch": 0.36553524804177545,
840
+ "grad_norm": 1.2448983192443848,
841
+ "learning_rate": 1.4666193313820418e-05,
842
+ "loss": 1.028,
843
+ "step": 1190
844
+ },
845
+ {
846
+ "epoch": 0.36860697281523574,
847
+ "grad_norm": 1.2823021411895752,
848
+ "learning_rate": 1.4577977569197915e-05,
849
+ "loss": 1.0719,
850
+ "step": 1200
851
+ },
852
+ {
853
+ "epoch": 0.37167869758869604,
854
+ "grad_norm": 1.0568265914916992,
855
+ "learning_rate": 1.4489308775161564e-05,
856
+ "loss": 1.0142,
857
+ "step": 1210
858
+ },
859
+ {
860
+ "epoch": 0.37475042236215633,
861
+ "grad_norm": 1.0901318788528442,
862
+ "learning_rate": 1.4400195706622247e-05,
863
+ "loss": 1.046,
864
+ "step": 1220
865
+ },
866
+ {
867
+ "epoch": 0.3778221471356166,
868
+ "grad_norm": 1.1522573232650757,
869
+ "learning_rate": 1.4310647182457475e-05,
870
+ "loss": 1.0342,
871
+ "step": 1230
872
+ },
873
+ {
874
+ "epoch": 0.3808938719090769,
875
+ "grad_norm": 1.0686454772949219,
876
+ "learning_rate": 1.422067206463868e-05,
877
+ "loss": 1.0486,
878
+ "step": 1240
879
+ },
880
+ {
881
+ "epoch": 0.38396559668253727,
882
+ "grad_norm": 1.1028980016708374,
883
+ "learning_rate": 1.413027925735417e-05,
884
+ "loss": 1.0219,
885
+ "step": 1250
886
+ },
887
+ {
888
+ "epoch": 0.38703732145599756,
889
+ "grad_norm": 1.297943353652954,
890
+ "learning_rate": 1.4039477706127982e-05,
891
+ "loss": 1.0349,
892
+ "step": 1260
893
+ },
894
+ {
895
+ "epoch": 0.39010904622945786,
896
+ "grad_norm": 1.081758737564087,
897
+ "learning_rate": 1.3948276396934578e-05,
898
+ "loss": 1.0313,
899
+ "step": 1270
900
+ },
901
+ {
902
+ "epoch": 0.39318077100291815,
903
+ "grad_norm": 1.3776434659957886,
904
+ "learning_rate": 1.3856684355309598e-05,
905
+ "loss": 0.9959,
906
+ "step": 1280
907
+ },
908
+ {
909
+ "epoch": 0.39625249577637844,
910
+ "grad_norm": 1.3725264072418213,
911
+ "learning_rate": 1.3764710645456639e-05,
912
+ "loss": 1.0351,
913
+ "step": 1290
914
+ },
915
+ {
916
+ "epoch": 0.39932422054983874,
917
+ "grad_norm": 1.3018957376480103,
918
+ "learning_rate": 1.3672364369350254e-05,
919
+ "loss": 1.0865,
920
+ "step": 1300
921
+ },
922
+ {
923
+ "epoch": 0.40239594532329903,
924
+ "grad_norm": 1.0644463300704956,
925
+ "learning_rate": 1.357965466583519e-05,
926
+ "loss": 1.026,
927
+ "step": 1310
928
+ },
929
+ {
930
+ "epoch": 0.4054676700967593,
931
+ "grad_norm": 1.2380294799804688,
932
+ "learning_rate": 1.348659070972199e-05,
933
+ "loss": 1.0132,
934
+ "step": 1320
935
+ },
936
+ {
937
+ "epoch": 0.4085393948702196,
938
+ "grad_norm": 1.39878249168396,
939
+ "learning_rate": 1.3393181710879002e-05,
940
+ "loss": 1.0792,
941
+ "step": 1330
942
+ },
943
+ {
944
+ "epoch": 0.4116111196436799,
945
+ "grad_norm": 1.2087831497192383,
946
+ "learning_rate": 1.3299436913320982e-05,
947
+ "loss": 1.0377,
948
+ "step": 1340
949
+ },
950
+ {
951
+ "epoch": 0.4146828444171402,
952
+ "grad_norm": 1.3151507377624512,
953
+ "learning_rate": 1.3205365594294257e-05,
954
+ "loss": 1.0384,
955
+ "step": 1350
956
+ },
957
+ {
958
+ "epoch": 0.4177545691906005,
959
+ "grad_norm": 1.2046235799789429,
960
+ "learning_rate": 1.3110977063358626e-05,
961
+ "loss": 0.996,
962
+ "step": 1360
963
+ },
964
+ {
965
+ "epoch": 0.4208262939640608,
966
+ "grad_norm": 1.3252286911010742,
967
+ "learning_rate": 1.3016280661466063e-05,
968
+ "loss": 1.0521,
969
+ "step": 1370
970
+ },
971
+ {
972
+ "epoch": 0.42389801873752114,
973
+ "grad_norm": 1.315597653388977,
974
+ "learning_rate": 1.292128576003631e-05,
975
+ "loss": 0.984,
976
+ "step": 1380
977
+ },
978
+ {
979
+ "epoch": 0.42696974351098144,
980
+ "grad_norm": 1.131601095199585,
981
+ "learning_rate": 1.2826001760029453e-05,
982
+ "loss": 1.064,
983
+ "step": 1390
984
+ },
985
+ {
986
+ "epoch": 0.43004146828444173,
987
+ "grad_norm": 1.1241368055343628,
988
+ "learning_rate": 1.2730438091015587e-05,
989
+ "loss": 1.0045,
990
+ "step": 1400
991
+ },
992
+ {
993
+ "epoch": 0.433113193057902,
994
+ "grad_norm": 1.1869995594024658,
995
+ "learning_rate": 1.263460421024162e-05,
996
+ "loss": 1.0165,
997
+ "step": 1410
998
+ },
999
+ {
1000
+ "epoch": 0.4361849178313623,
1001
+ "grad_norm": 1.1647521257400513,
1002
+ "learning_rate": 1.2538509601695372e-05,
1003
+ "loss": 0.9639,
1004
+ "step": 1420
1005
+ },
1006
+ {
1007
+ "epoch": 0.4392566426048226,
1008
+ "grad_norm": 1.3259695768356323,
1009
+ "learning_rate": 1.2442163775167009e-05,
1010
+ "loss": 1.0209,
1011
+ "step": 1430
1012
+ },
1013
+ {
1014
+ "epoch": 0.4423283673782829,
1015
+ "grad_norm": 1.061746597290039,
1016
+ "learning_rate": 1.2345576265307934e-05,
1017
+ "loss": 1.0044,
1018
+ "step": 1440
1019
+ },
1020
+ {
1021
+ "epoch": 0.4454000921517432,
1022
+ "grad_norm": 1.3238203525543213,
1023
+ "learning_rate": 1.2248756630687204e-05,
1024
+ "loss": 1.0122,
1025
+ "step": 1450
1026
+ },
1027
+ {
1028
+ "epoch": 0.4484718169252035,
1029
+ "grad_norm": 1.1721187829971313,
1030
+ "learning_rate": 1.2151714452845582e-05,
1031
+ "loss": 1.0814,
1032
+ "step": 1460
1033
+ },
1034
+ {
1035
+ "epoch": 0.4515435416986638,
1036
+ "grad_norm": 1.0964605808258057,
1037
+ "learning_rate": 1.2054459335347333e-05,
1038
+ "loss": 1.0629,
1039
+ "step": 1470
1040
+ },
1041
+ {
1042
+ "epoch": 0.4546152664721241,
1043
+ "grad_norm": 1.2595895528793335,
1044
+ "learning_rate": 1.1957000902829827e-05,
1045
+ "loss": 1.0172,
1046
+ "step": 1480
1047
+ },
1048
+ {
1049
+ "epoch": 0.4576869912455844,
1050
+ "grad_norm": 1.1863325834274292,
1051
+ "learning_rate": 1.1859348800051047e-05,
1052
+ "loss": 0.9903,
1053
+ "step": 1490
1054
+ },
1055
+ {
1056
+ "epoch": 0.46075871601904467,
1057
+ "grad_norm": 1.2627488374710083,
1058
+ "learning_rate": 1.1761512690935135e-05,
1059
+ "loss": 1.0419,
1060
+ "step": 1500
1061
+ },
1062
+ {
1063
+ "epoch": 0.463830440792505,
1064
+ "grad_norm": 1.2958710193634033,
1065
+ "learning_rate": 1.1663502257616002e-05,
1066
+ "loss": 0.9637,
1067
+ "step": 1510
1068
+ },
1069
+ {
1070
+ "epoch": 0.4669021655659653,
1071
+ "grad_norm": 1.5594003200531006,
1072
+ "learning_rate": 1.1565327199479173e-05,
1073
+ "loss": 0.9793,
1074
+ "step": 1520
1075
+ },
1076
+ {
1077
+ "epoch": 0.4699738903394256,
1078
+ "grad_norm": 1.2414544820785522,
1079
+ "learning_rate": 1.1466997232201901e-05,
1080
+ "loss": 1.0322,
1081
+ "step": 1530
1082
+ },
1083
+ {
1084
+ "epoch": 0.4730456151128859,
1085
+ "grad_norm": 1.5642577409744263,
1086
+ "learning_rate": 1.1368522086791688e-05,
1087
+ "loss": 1.0769,
1088
+ "step": 1540
1089
+ },
1090
+ {
1091
+ "epoch": 0.4761173398863462,
1092
+ "grad_norm": 1.3605339527130127,
1093
+ "learning_rate": 1.1269911508623255e-05,
1094
+ "loss": 0.9849,
1095
+ "step": 1550
1096
+ },
1097
+ {
1098
+ "epoch": 0.4791890646598065,
1099
+ "grad_norm": 1.3629966974258423,
1100
+ "learning_rate": 1.1171175256474137e-05,
1101
+ "loss": 1.0511,
1102
+ "step": 1560
1103
+ },
1104
+ {
1105
+ "epoch": 0.4822607894332668,
1106
+ "grad_norm": 1.4438694715499878,
1107
+ "learning_rate": 1.1072323101558908e-05,
1108
+ "loss": 0.9724,
1109
+ "step": 1570
1110
+ },
1111
+ {
1112
+ "epoch": 0.4853325142067271,
1113
+ "grad_norm": 1.2920620441436768,
1114
+ "learning_rate": 1.0973364826562208e-05,
1115
+ "loss": 1.0242,
1116
+ "step": 1580
1117
+ },
1118
+ {
1119
+ "epoch": 0.48840423898018737,
1120
+ "grad_norm": 1.1293901205062866,
1121
+ "learning_rate": 1.0874310224670615e-05,
1122
+ "loss": 0.9828,
1123
+ "step": 1590
1124
+ },
1125
+ {
1126
+ "epoch": 0.49147596375364766,
1127
+ "grad_norm": 1.3501414060592651,
1128
+ "learning_rate": 1.0775169098603487e-05,
1129
+ "loss": 1.0432,
1130
+ "step": 1600
1131
+ },
1132
+ {
1133
+ "epoch": 0.49454768852710795,
1134
+ "grad_norm": 1.3344357013702393,
1135
+ "learning_rate": 1.0675951259642846e-05,
1136
+ "loss": 1.0102,
1137
+ "step": 1610
1138
+ },
1139
+ {
1140
+ "epoch": 0.49761941330056825,
1141
+ "grad_norm": 1.354965090751648,
1142
+ "learning_rate": 1.0576666526662447e-05,
1143
+ "loss": 0.9579,
1144
+ "step": 1620
1145
+ },
1146
+ {
1147
+ "epoch": 0.5006911380740285,
1148
+ "grad_norm": 1.2013002634048462,
1149
+ "learning_rate": 1.0477324725156058e-05,
1150
+ "loss": 1.0215,
1151
+ "step": 1630
1152
+ },
1153
+ {
1154
+ "epoch": 0.5037628628474888,
1155
+ "grad_norm": 1.5074472427368164,
1156
+ "learning_rate": 1.037793568626511e-05,
1157
+ "loss": 1.0077,
1158
+ "step": 1640
1159
+ },
1160
+ {
1161
+ "epoch": 0.5068345876209491,
1162
+ "grad_norm": 1.3318787813186646,
1163
+ "learning_rate": 1.0278509245805774e-05,
1164
+ "loss": 1.0502,
1165
+ "step": 1650
1166
+ },
1167
+ {
1168
+ "epoch": 0.5099063123944094,
1169
+ "grad_norm": 1.3787329196929932,
1170
+ "learning_rate": 1.0179055243295587e-05,
1171
+ "loss": 1.0129,
1172
+ "step": 1660
1173
+ },
1174
+ {
1175
+ "epoch": 0.5129780371678697,
1176
+ "grad_norm": 1.1923975944519043,
1177
+ "learning_rate": 1.0079583520979694e-05,
1178
+ "loss": 1.0129,
1179
+ "step": 1670
1180
+ },
1181
+ {
1182
+ "epoch": 0.51604976194133,
1183
+ "grad_norm": 1.707875370979309,
1184
+ "learning_rate": 9.980103922856861e-06,
1185
+ "loss": 1.0259,
1186
+ "step": 1680
1187
+ },
1188
+ {
1189
+ "epoch": 0.5191214867147903,
1190
+ "grad_norm": 1.3130182027816772,
1191
+ "learning_rate": 9.880626293705255e-06,
1192
+ "loss": 0.9913,
1193
+ "step": 1690
1194
+ },
1195
+ {
1196
+ "epoch": 0.5221932114882507,
1197
+ "grad_norm": 2.6558756828308105,
1198
+ "learning_rate": 9.781160478108177e-06,
1199
+ "loss": 0.9719,
1200
+ "step": 1700
1201
+ },
1202
+ {
1203
+ "epoch": 0.525264936261711,
1204
+ "grad_norm": 1.4856926202774048,
1205
+ "learning_rate": 9.68171631947984e-06,
1206
+ "loss": 1.0207,
1207
+ "step": 1710
1208
+ },
1209
+ {
1210
+ "epoch": 0.5283366610351713,
1211
+ "grad_norm": 1.590119481086731,
1212
+ "learning_rate": 9.582303659091222e-06,
1213
+ "loss": 1.023,
1214
+ "step": 1720
1215
+ },
1216
+ {
1217
+ "epoch": 0.5314083858086316,
1218
+ "grad_norm": 1.4528511762619019,
1219
+ "learning_rate": 9.482932335096144e-06,
1220
+ "loss": 1.0007,
1221
+ "step": 1730
1222
+ },
1223
+ {
1224
+ "epoch": 0.5344801105820919,
1225
+ "grad_norm": 1.351442813873291,
1226
+ "learning_rate": 9.38361218155766e-06,
1227
+ "loss": 0.9612,
1228
+ "step": 1740
1229
+ },
1230
+ {
1231
+ "epoch": 0.5375518353555522,
1232
+ "grad_norm": 1.8360393047332764,
1233
+ "learning_rate": 9.28435302747486e-06,
1234
+ "loss": 1.0286,
1235
+ "step": 1750
1236
+ },
1237
+ {
1238
+ "epoch": 0.5406235601290125,
1239
+ "grad_norm": 1.3718026876449585,
1240
+ "learning_rate": 9.18516469581015e-06,
1241
+ "loss": 1.0251,
1242
+ "step": 1760
1243
+ },
1244
+ {
1245
+ "epoch": 0.5436952849024728,
1246
+ "grad_norm": 3.648052930831909,
1247
+ "learning_rate": 9.086057002517163e-06,
1248
+ "loss": 1.0119,
1249
+ "step": 1770
1250
+ },
1251
+ {
1252
+ "epoch": 0.5467670096759331,
1253
+ "grad_norm": 1.5079731941223145,
1254
+ "learning_rate": 8.98703975556932e-06,
1255
+ "loss": 1.0646,
1256
+ "step": 1780
1257
+ },
1258
+ {
1259
+ "epoch": 0.5498387344493934,
1260
+ "grad_norm": 1.3732868432998657,
1261
+ "learning_rate": 8.88812275398923e-06,
1262
+ "loss": 1.0119,
1263
+ "step": 1790
1264
+ },
1265
+ {
1266
+ "epoch": 0.5529104592228536,
1267
+ "grad_norm": 1.21127450466156,
1268
+ "learning_rate": 8.789315786878936e-06,
1269
+ "loss": 0.9586,
1270
+ "step": 1800
1271
+ },
1272
+ {
1273
+ "epoch": 0.5559821839963139,
1274
+ "grad_norm": 1.5817351341247559,
1275
+ "learning_rate": 8.69062863245117e-06,
1276
+ "loss": 1.0425,
1277
+ "step": 1810
1278
+ },
1279
+ {
1280
+ "epoch": 0.5590539087697742,
1281
+ "grad_norm": 1.689554214477539,
1282
+ "learning_rate": 8.59207105706166e-06,
1283
+ "loss": 1.0406,
1284
+ "step": 1820
1285
+ },
1286
+ {
1287
+ "epoch": 0.5621256335432345,
1288
+ "grad_norm": 1.2953405380249023,
1289
+ "learning_rate": 8.493652814242632e-06,
1290
+ "loss": 0.9987,
1291
+ "step": 1830
1292
+ },
1293
+ {
1294
+ "epoch": 0.5651973583166948,
1295
+ "grad_norm": 1.519486904144287,
1296
+ "learning_rate": 8.395383643737575e-06,
1297
+ "loss": 0.9818,
1298
+ "step": 1840
1299
+ },
1300
+ {
1301
+ "epoch": 0.5682690830901551,
1302
+ "grad_norm": 1.1716630458831787,
1303
+ "learning_rate": 8.297273270537372e-06,
1304
+ "loss": 0.9914,
1305
+ "step": 1850
1306
+ },
1307
+ {
1308
+ "epoch": 0.5713408078636154,
1309
+ "grad_norm": 1.412215232849121,
1310
+ "learning_rate": 8.199331403917869e-06,
1311
+ "loss": 0.9952,
1312
+ "step": 1860
1313
+ },
1314
+ {
1315
+ "epoch": 0.5744125326370757,
1316
+ "grad_norm": 1.7365607023239136,
1317
+ "learning_rate": 8.101567736479044e-06,
1318
+ "loss": 0.9934,
1319
+ "step": 1870
1320
+ },
1321
+ {
1322
+ "epoch": 0.577484257410536,
1323
+ "grad_norm": 1.4288475513458252,
1324
+ "learning_rate": 8.003991943185778e-06,
1325
+ "loss": 1.0203,
1326
+ "step": 1880
1327
+ },
1328
+ {
1329
+ "epoch": 0.5805559821839963,
1330
+ "grad_norm": 1.3489760160446167,
1331
+ "learning_rate": 7.906613680410415e-06,
1332
+ "loss": 0.9778,
1333
+ "step": 1890
1334
+ },
1335
+ {
1336
+ "epoch": 0.5836277069574566,
1337
+ "grad_norm": 1.46076238155365,
1338
+ "learning_rate": 7.809442584977113e-06,
1339
+ "loss": 0.9823,
1340
+ "step": 1900
1341
+ },
1342
+ {
1343
+ "epoch": 0.5866994317309169,
1344
+ "grad_norm": 1.5136983394622803,
1345
+ "learning_rate": 7.712488273208183e-06,
1346
+ "loss": 1.0759,
1347
+ "step": 1910
1348
+ },
1349
+ {
1350
+ "epoch": 0.5897711565043772,
1351
+ "grad_norm": 1.2395247220993042,
1352
+ "learning_rate": 7.615760339972421e-06,
1353
+ "loss": 0.9722,
1354
+ "step": 1920
1355
+ },
1356
+ {
1357
+ "epoch": 0.5928428812778375,
1358
+ "grad_norm": 1.1349307298660278,
1359
+ "learning_rate": 7.519268357735574e-06,
1360
+ "loss": 0.9651,
1361
+ "step": 1930
1362
+ },
1363
+ {
1364
+ "epoch": 0.5959146060512978,
1365
+ "grad_norm": 1.512542963027954,
1366
+ "learning_rate": 7.423021875613009e-06,
1367
+ "loss": 0.9941,
1368
+ "step": 1940
1369
+ },
1370
+ {
1371
+ "epoch": 0.598986330824758,
1372
+ "grad_norm": 1.5082643032073975,
1373
+ "learning_rate": 7.32703041842473e-06,
1374
+ "loss": 1.0251,
1375
+ "step": 1950
1376
+ },
1377
+ {
1378
+ "epoch": 0.6020580555982183,
1379
+ "grad_norm": 1.27542245388031,
1380
+ "learning_rate": 7.231303485752756e-06,
1381
+ "loss": 1.0445,
1382
+ "step": 1960
1383
+ },
1384
+ {
1385
+ "epoch": 0.6051297803716787,
1386
+ "grad_norm": 1.3753620386123657,
1387
+ "learning_rate": 7.135850551001034e-06,
1388
+ "loss": 0.9565,
1389
+ "step": 1970
1390
+ },
1391
+ {
1392
+ "epoch": 0.608201505145139,
1393
+ "grad_norm": 1.503854513168335,
1394
+ "learning_rate": 7.040681060457895e-06,
1395
+ "loss": 1.0124,
1396
+ "step": 1980
1397
+ },
1398
+ {
1399
+ "epoch": 0.6112732299185993,
1400
+ "grad_norm": 1.3910346031188965,
1401
+ "learning_rate": 6.9458044323612575e-06,
1402
+ "loss": 1.0008,
1403
+ "step": 1990
1404
+ },
1405
+ {
1406
+ "epoch": 0.6143449546920596,
1407
+ "grad_norm": 1.2628332376480103,
1408
+ "learning_rate": 6.851230055966549e-06,
1409
+ "loss": 0.9856,
1410
+ "step": 2000
1411
+ },
1412
+ {
1413
+ "epoch": 0.6174166794655199,
1414
+ "grad_norm": 1.8083914518356323,
1415
+ "learning_rate": 6.756967290617533e-06,
1416
+ "loss": 0.9701,
1417
+ "step": 2010
1418
+ },
1419
+ {
1420
+ "epoch": 0.6204884042389802,
1421
+ "grad_norm": 1.4478784799575806,
1422
+ "learning_rate": 6.6630254648200656e-06,
1423
+ "loss": 0.9418,
1424
+ "step": 2020
1425
+ },
1426
+ {
1427
+ "epoch": 0.6235601290124405,
1428
+ "grad_norm": 2.209411859512329,
1429
+ "learning_rate": 6.569413875318937e-06,
1430
+ "loss": 0.9841,
1431
+ "step": 2030
1432
+ },
1433
+ {
1434
+ "epoch": 0.6266318537859008,
1435
+ "grad_norm": 1.5672208070755005,
1436
+ "learning_rate": 6.4761417861778366e-06,
1437
+ "loss": 0.9717,
1438
+ "step": 2040
1439
+ },
1440
+ {
1441
+ "epoch": 0.6297035785593611,
1442
+ "grad_norm": 1.6222248077392578,
1443
+ "learning_rate": 6.383218427862544e-06,
1444
+ "loss": 0.9686,
1445
+ "step": 2050
1446
+ },
1447
+ {
1448
+ "epoch": 0.6327753033328214,
1449
+ "grad_norm": 1.7131073474884033,
1450
+ "learning_rate": 6.290652996327471e-06,
1451
+ "loss": 1.0376,
1452
+ "step": 2060
1453
+ },
1454
+ {
1455
+ "epoch": 0.6358470281062817,
1456
+ "grad_norm": 1.3580713272094727,
1457
+ "learning_rate": 6.198454652105599e-06,
1458
+ "loss": 0.9885,
1459
+ "step": 2070
1460
+ },
1461
+ {
1462
+ "epoch": 0.638918752879742,
1463
+ "grad_norm": 1.482251524925232,
1464
+ "learning_rate": 6.106632519401924e-06,
1465
+ "loss": 1.0063,
1466
+ "step": 2080
1467
+ },
1468
+ {
1469
+ "epoch": 0.6419904776532023,
1470
+ "grad_norm": 1.4394768476486206,
1471
+ "learning_rate": 6.015195685190496e-06,
1472
+ "loss": 1.0049,
1473
+ "step": 2090
1474
+ },
1475
+ {
1476
+ "epoch": 0.6450622024266626,
1477
+ "grad_norm": 1.397321105003357,
1478
+ "learning_rate": 5.9241531983151604e-06,
1479
+ "loss": 1.061,
1480
+ "step": 2100
1481
+ },
1482
+ {
1483
+ "epoch": 0.6481339272001229,
1484
+ "grad_norm": 1.4152517318725586,
1485
+ "learning_rate": 5.833514068594053e-06,
1486
+ "loss": 1.0047,
1487
+ "step": 2110
1488
+ },
1489
+ {
1490
+ "epoch": 0.6512056519735832,
1491
+ "grad_norm": 1.4583706855773926,
1492
+ "learning_rate": 5.743287265927959e-06,
1493
+ "loss": 0.9749,
1494
+ "step": 2120
1495
+ },
1496
+ {
1497
+ "epoch": 0.6542773767470434,
1498
+ "grad_norm": 1.5237219333648682,
1499
+ "learning_rate": 5.65348171941263e-06,
1500
+ "loss": 1.0137,
1501
+ "step": 2130
1502
+ },
1503
+ {
1504
+ "epoch": 0.6573491015205037,
1505
+ "grad_norm": 1.4176579713821411,
1506
+ "learning_rate": 5.564106316455127e-06,
1507
+ "loss": 0.9807,
1508
+ "step": 2140
1509
+ },
1510
+ {
1511
+ "epoch": 0.660420826293964,
1512
+ "grad_norm": 1.3908677101135254,
1513
+ "learning_rate": 5.475169901894324e-06,
1514
+ "loss": 0.9511,
1515
+ "step": 2150
1516
+ },
1517
+ {
1518
+ "epoch": 0.6634925510674243,
1519
+ "grad_norm": 1.3458560705184937,
1520
+ "learning_rate": 5.386681277125565e-06,
1521
+ "loss": 1.0058,
1522
+ "step": 2160
1523
+ },
1524
+ {
1525
+ "epoch": 0.6665642758408846,
1526
+ "grad_norm": 1.272244930267334,
1527
+ "learning_rate": 5.298649199229671e-06,
1528
+ "loss": 0.9592,
1529
+ "step": 2170
1530
+ },
1531
+ {
1532
+ "epoch": 0.6696360006143449,
1533
+ "grad_norm": 1.8487739562988281,
1534
+ "learning_rate": 5.211082380106323e-06,
1535
+ "loss": 1.0428,
1536
+ "step": 2180
1537
+ },
1538
+ {
1539
+ "epoch": 0.6727077253878052,
1540
+ "grad_norm": 1.4018948078155518,
1541
+ "learning_rate": 5.123989485611881e-06,
1542
+ "loss": 1.0358,
1543
+ "step": 2190
1544
+ },
1545
+ {
1546
+ "epoch": 0.6757794501612655,
1547
+ "grad_norm": 2.0736992359161377,
1548
+ "learning_rate": 5.037379134701827e-06,
1549
+ "loss": 1.015,
1550
+ "step": 2200
1551
+ },
1552
+ {
1553
+ "epoch": 0.6788511749347258,
1554
+ "grad_norm": 1.3019697666168213,
1555
+ "learning_rate": 4.951259898577754e-06,
1556
+ "loss": 0.9984,
1557
+ "step": 2210
1558
+ },
1559
+ {
1560
+ "epoch": 0.6819228997081861,
1561
+ "grad_norm": 1.2767648696899414,
1562
+ "learning_rate": 4.865640299839193e-06,
1563
+ "loss": 1.0063,
1564
+ "step": 2220
1565
+ },
1566
+ {
1567
+ "epoch": 0.6849946244816465,
1568
+ "grad_norm": 1.8070132732391357,
1569
+ "learning_rate": 4.780528811640162e-06,
1570
+ "loss": 1.0371,
1571
+ "step": 2230
1572
+ },
1573
+ {
1574
+ "epoch": 0.6880663492551068,
1575
+ "grad_norm": 1.3197108507156372,
1576
+ "learning_rate": 4.69593385685064e-06,
1577
+ "loss": 1.0238,
1578
+ "step": 2240
1579
+ },
1580
+ {
1581
+ "epoch": 0.6911380740285671,
1582
+ "grad_norm": 1.4064984321594238,
1583
+ "learning_rate": 4.611863807223021e-06,
1584
+ "loss": 1.0283,
1585
+ "step": 2250
1586
+ },
1587
+ {
1588
+ "epoch": 0.6942097988020274,
1589
+ "grad_norm": 1.696387767791748,
1590
+ "learning_rate": 4.528326982563619e-06,
1591
+ "loss": 1.0002,
1592
+ "step": 2260
1593
+ },
1594
+ {
1595
+ "epoch": 0.6972815235754877,
1596
+ "grad_norm": 1.372271180152893,
1597
+ "learning_rate": 4.44533164990933e-06,
1598
+ "loss": 0.9299,
1599
+ "step": 2270
1600
+ },
1601
+ {
1602
+ "epoch": 0.700353248348948,
1603
+ "grad_norm": 1.4047715663909912,
1604
+ "learning_rate": 4.362886022709493e-06,
1605
+ "loss": 1.0112,
1606
+ "step": 2280
1607
+ },
1608
+ {
1609
+ "epoch": 0.7034249731224083,
1610
+ "grad_norm": 1.5955997705459595,
1611
+ "learning_rate": 4.280998260013043e-06,
1612
+ "loss": 0.9021,
1613
+ "step": 2290
1614
+ },
1615
+ {
1616
+ "epoch": 0.7064966978958686,
1617
+ "grad_norm": 1.2997745275497437,
1618
+ "learning_rate": 4.199676465661115e-06,
1619
+ "loss": 1.057,
1620
+ "step": 2300
1621
+ },
1622
+ {
1623
+ "epoch": 0.7095684226693288,
1624
+ "grad_norm": 1.4059574604034424,
1625
+ "learning_rate": 4.118928687485021e-06,
1626
+ "loss": 0.9787,
1627
+ "step": 2310
1628
+ },
1629
+ {
1630
+ "epoch": 0.7126401474427891,
1631
+ "grad_norm": 1.478273868560791,
1632
+ "learning_rate": 4.0387629165098485e-06,
1633
+ "loss": 0.9974,
1634
+ "step": 2320
1635
+ },
1636
+ {
1637
+ "epoch": 0.7157118722162494,
1638
+ "grad_norm": 1.3587843179702759,
1639
+ "learning_rate": 3.9591870861636214e-06,
1640
+ "loss": 1.0576,
1641
+ "step": 2330
1642
+ },
1643
+ {
1644
+ "epoch": 0.7187835969897097,
1645
+ "grad_norm": 1.487290620803833,
1646
+ "learning_rate": 3.880209071492195e-06,
1647
+ "loss": 0.999,
1648
+ "step": 2340
1649
+ },
1650
+ {
1651
+ "epoch": 0.72185532176317,
1652
+ "grad_norm": 1.4370648860931396,
1653
+ "learning_rate": 3.8018366883799263e-06,
1654
+ "loss": 1.0137,
1655
+ "step": 2350
1656
+ },
1657
+ {
1658
+ "epoch": 0.7249270465366303,
1659
+ "grad_norm": 1.2997723817825317,
1660
+ "learning_rate": 3.7240776927761825e-06,
1661
+ "loss": 0.9579,
1662
+ "step": 2360
1663
+ },
1664
+ {
1665
+ "epoch": 0.7279987713100906,
1666
+ "grad_norm": 1.533390760421753,
1667
+ "learning_rate": 3.6469397799277884e-06,
1668
+ "loss": 0.9849,
1669
+ "step": 2370
1670
+ },
1671
+ {
1672
+ "epoch": 0.7310704960835509,
1673
+ "grad_norm": 1.2424747943878174,
1674
+ "learning_rate": 3.5704305836175025e-06,
1675
+ "loss": 0.9922,
1676
+ "step": 2380
1677
+ },
1678
+ {
1679
+ "epoch": 0.7341422208570112,
1680
+ "grad_norm": 1.4729893207550049,
1681
+ "learning_rate": 3.4945576754085285e-06,
1682
+ "loss": 0.918,
1683
+ "step": 2390
1684
+ },
1685
+ {
1686
+ "epoch": 0.7372139456304715,
1687
+ "grad_norm": 1.9271929264068604,
1688
+ "learning_rate": 3.41932856389524e-06,
1689
+ "loss": 1.0052,
1690
+ "step": 2400
1691
+ },
1692
+ {
1693
+ "epoch": 0.7402856704039318,
1694
+ "grad_norm": 1.5550090074539185,
1695
+ "learning_rate": 3.344750693960088e-06,
1696
+ "loss": 0.9793,
1697
+ "step": 2410
1698
+ },
1699
+ {
1700
+ "epoch": 0.7433573951773921,
1701
+ "grad_norm": 1.2984689474105835,
1702
+ "learning_rate": 3.2708314460368417e-06,
1703
+ "loss": 1.0048,
1704
+ "step": 2420
1705
+ },
1706
+ {
1707
+ "epoch": 0.7464291199508524,
1708
+ "grad_norm": 1.3870617151260376,
1709
+ "learning_rate": 3.1975781353802095e-06,
1710
+ "loss": 0.9915,
1711
+ "step": 2430
1712
+ },
1713
+ {
1714
+ "epoch": 0.7495008447243127,
1715
+ "grad_norm": 1.4227845668792725,
1716
+ "learning_rate": 3.124998011341883e-06,
1717
+ "loss": 0.9949,
1718
+ "step": 2440
1719
+ },
1720
+ {
1721
+ "epoch": 0.752572569497773,
1722
+ "grad_norm": 1.5820941925048828,
1723
+ "learning_rate": 3.0530982566531374e-06,
1724
+ "loss": 0.9694,
1725
+ "step": 2450
1726
+ },
1727
+ {
1728
+ "epoch": 0.7556442942712333,
1729
+ "grad_norm": 2.8299295902252197,
1730
+ "learning_rate": 2.981885986713995e-06,
1731
+ "loss": 0.9964,
1732
+ "step": 2460
1733
+ },
1734
+ {
1735
+ "epoch": 0.7587160190446935,
1736
+ "grad_norm": 1.6240131855010986,
1737
+ "learning_rate": 2.911368248889078e-06,
1738
+ "loss": 0.9875,
1739
+ "step": 2470
1740
+ },
1741
+ {
1742
+ "epoch": 0.7617877438181538,
1743
+ "grad_norm": 1.4417290687561035,
1744
+ "learning_rate": 2.841552021810183e-06,
1745
+ "loss": 0.9615,
1746
+ "step": 2480
1747
+ },
1748
+ {
1749
+ "epoch": 0.7648594685916142,
1750
+ "grad_norm": 1.56830632686615,
1751
+ "learning_rate": 2.7724442146856266e-06,
1752
+ "loss": 0.9857,
1753
+ "step": 2490
1754
+ },
1755
+ {
1756
+ "epoch": 0.7679311933650745,
1757
+ "grad_norm": 1.7147358655929565,
1758
+ "learning_rate": 2.704051666616534e-06,
1759
+ "loss": 1.0245,
1760
+ "step": 2500
1761
+ },
1762
+ {
1763
+ "epoch": 0.7710029181385348,
1764
+ "grad_norm": 1.4760040044784546,
1765
+ "learning_rate": 2.6363811459199896e-06,
1766
+ "loss": 0.9871,
1767
+ "step": 2510
1768
+ },
1769
+ {
1770
+ "epoch": 0.7740746429119951,
1771
+ "grad_norm": 1.227241039276123,
1772
+ "learning_rate": 2.5694393494592475e-06,
1773
+ "loss": 0.9439,
1774
+ "step": 2520
1775
+ },
1776
+ {
1777
+ "epoch": 0.7771463676854554,
1778
+ "grad_norm": 1.5236525535583496,
1779
+ "learning_rate": 2.5032329019809733e-06,
1780
+ "loss": 0.9765,
1781
+ "step": 2530
1782
+ },
1783
+ {
1784
+ "epoch": 0.7802180924589157,
1785
+ "grad_norm": 1.4436039924621582,
1786
+ "learning_rate": 2.4377683554596465e-06,
1787
+ "loss": 1.0443,
1788
+ "step": 2540
1789
+ },
1790
+ {
1791
+ "epoch": 0.783289817232376,
1792
+ "grad_norm": 1.4698201417922974,
1793
+ "learning_rate": 2.3730521884491744e-06,
1794
+ "loss": 1.0435,
1795
+ "step": 2550
1796
+ },
1797
+ {
1798
+ "epoch": 0.7863615420058363,
1799
+ "grad_norm": 1.3048474788665771,
1800
+ "learning_rate": 2.3090908054417294e-06,
1801
+ "loss": 1.0267,
1802
+ "step": 2560
1803
+ },
1804
+ {
1805
+ "epoch": 0.7894332667792966,
1806
+ "grad_norm": 1.3633249998092651,
1807
+ "learning_rate": 2.24589053623396e-06,
1808
+ "loss": 0.9543,
1809
+ "step": 2570
1810
+ },
1811
+ {
1812
+ "epoch": 0.7925049915527569,
1813
+ "grad_norm": 1.4888864755630493,
1814
+ "learning_rate": 2.1834576353005786e-06,
1815
+ "loss": 0.9568,
1816
+ "step": 2580
1817
+ },
1818
+ {
1819
+ "epoch": 0.7955767163262172,
1820
+ "grad_norm": 1.5138416290283203,
1821
+ "learning_rate": 2.1217982811753855e-06,
1822
+ "loss": 1.006,
1823
+ "step": 2590
1824
+ },
1825
+ {
1826
+ "epoch": 0.7986484410996775,
1827
+ "grad_norm": 1.4148247241973877,
1828
+ "learning_rate": 2.0609185758398444e-06,
1829
+ "loss": 0.9452,
1830
+ "step": 2600
1831
+ },
1832
+ {
1833
+ "epoch": 0.8017201658731378,
1834
+ "grad_norm": 1.6830410957336426,
1835
+ "learning_rate": 2.0008245441191954e-06,
1836
+ "loss": 1.0418,
1837
+ "step": 2610
1838
+ },
1839
+ {
1840
+ "epoch": 0.8047918906465981,
1841
+ "grad_norm": 1.4986162185668945,
1842
+ "learning_rate": 1.9415221330862276e-06,
1843
+ "loss": 0.9715,
1844
+ "step": 2620
1845
+ },
1846
+ {
1847
+ "epoch": 0.8078636154200584,
1848
+ "grad_norm": 1.3727550506591797,
1849
+ "learning_rate": 1.8830172114727508e-06,
1850
+ "loss": 0.9621,
1851
+ "step": 2630
1852
+ },
1853
+ {
1854
+ "epoch": 0.8109353401935187,
1855
+ "grad_norm": 1.5304306745529175,
1856
+ "learning_rate": 1.8253155690887915e-06,
1857
+ "loss": 1.0468,
1858
+ "step": 2640
1859
+ },
1860
+ {
1861
+ "epoch": 0.8140070649669789,
1862
+ "grad_norm": 1.4285249710083008,
1863
+ "learning_rate": 1.768422916249626e-06,
1864
+ "loss": 1.0002,
1865
+ "step": 2650
1866
+ },
1867
+ {
1868
+ "epoch": 0.8170787897404392,
1869
+ "grad_norm": 1.4852147102355957,
1870
+ "learning_rate": 1.7123448832106793e-06,
1871
+ "loss": 0.9601,
1872
+ "step": 2660
1873
+ },
1874
+ {
1875
+ "epoch": 0.8201505145138995,
1876
+ "grad_norm": 1.228345274925232,
1877
+ "learning_rate": 1.6570870196103218e-06,
1878
+ "loss": 1.0149,
1879
+ "step": 2670
1880
+ },
1881
+ {
1882
+ "epoch": 0.8232222392873598,
1883
+ "grad_norm": 1.2908066511154175,
1884
+ "learning_rate": 1.6026547939206826e-06,
1885
+ "loss": 0.9846,
1886
+ "step": 2680
1887
+ },
1888
+ {
1889
+ "epoch": 0.8262939640608201,
1890
+ "grad_norm": 1.5527454614639282,
1891
+ "learning_rate": 1.5490535929064476e-06,
1892
+ "loss": 0.9648,
1893
+ "step": 2690
1894
+ },
1895
+ {
1896
+ "epoch": 0.8293656888342804,
1897
+ "grad_norm": 1.4668254852294922,
1898
+ "learning_rate": 1.4962887210917987e-06,
1899
+ "loss": 1.0775,
1900
+ "step": 2700
1901
+ },
1902
+ {
1903
+ "epoch": 0.8324374136077407,
1904
+ "grad_norm": 1.3337604999542236,
1905
+ "learning_rate": 1.444365400235448e-06,
1906
+ "loss": 0.9332,
1907
+ "step": 2710
1908
+ },
1909
+ {
1910
+ "epoch": 0.835509138381201,
1911
+ "grad_norm": 1.3232961893081665,
1912
+ "learning_rate": 1.3932887688138775e-06,
1913
+ "loss": 0.9896,
1914
+ "step": 2720
1915
+ },
1916
+ {
1917
+ "epoch": 0.8385808631546613,
1918
+ "grad_norm": 1.5601872205734253,
1919
+ "learning_rate": 1.3430638815128239e-06,
1920
+ "loss": 1.0039,
1921
+ "step": 2730
1922
+ },
1923
+ {
1924
+ "epoch": 0.8416525879281216,
1925
+ "grad_norm": 1.4922980070114136,
1926
+ "learning_rate": 1.2936957087270519e-06,
1927
+ "loss": 1.0156,
1928
+ "step": 2740
1929
+ },
1930
+ {
1931
+ "epoch": 0.844724312701582,
1932
+ "grad_norm": 1.5822229385375977,
1933
+ "learning_rate": 1.2451891360684764e-06,
1934
+ "loss": 0.9864,
1935
+ "step": 2750
1936
+ },
1937
+ {
1938
+ "epoch": 0.8477960374750423,
1939
+ "grad_norm": 1.2503588199615479,
1940
+ "learning_rate": 1.1975489638826609e-06,
1941
+ "loss": 0.9598,
1942
+ "step": 2760
1943
+ },
1944
+ {
1945
+ "epoch": 0.8508677622485026,
1946
+ "grad_norm": 1.364817500114441,
1947
+ "learning_rate": 1.1507799067737591e-06,
1948
+ "loss": 0.8933,
1949
+ "step": 2770
1950
+ },
1951
+ {
1952
+ "epoch": 0.8539394870219629,
1953
+ "grad_norm": 1.3507331609725952,
1954
+ "learning_rate": 1.1048865931379594e-06,
1955
+ "loss": 1.0543,
1956
+ "step": 2780
1957
+ },
1958
+ {
1959
+ "epoch": 0.8570112117954232,
1960
+ "grad_norm": 1.4149025678634644,
1961
+ "learning_rate": 1.059873564705427e-06,
1962
+ "loss": 0.9826,
1963
+ "step": 2790
1964
+ },
1965
+ {
1966
+ "epoch": 0.8600829365688835,
1967
+ "grad_norm": 1.4314866065979004,
1968
+ "learning_rate": 1.0157452760908604e-06,
1969
+ "loss": 1.0093,
1970
+ "step": 2800
1971
+ },
1972
+ {
1973
+ "epoch": 0.8631546613423438,
1974
+ "grad_norm": 1.4840718507766724,
1975
+ "learning_rate": 9.725060943526343e-07,
1976
+ "loss": 0.9949,
1977
+ "step": 2810
1978
+ },
1979
+ {
1980
+ "epoch": 0.866226386115804,
1981
+ "grad_norm": 1.3463382720947266,
1982
+ "learning_rate": 9.301602985606284e-07,
1983
+ "loss": 0.9574,
1984
+ "step": 2820
1985
+ },
1986
+ {
1987
+ "epoch": 0.8692981108892643,
1988
+ "grad_norm": 1.8905155658721924,
1989
+ "learning_rate": 8.887120793727677e-07,
1990
+ "loss": 1.0334,
1991
+ "step": 2830
1992
+ },
1993
+ {
1994
+ "epoch": 0.8723698356627246,
1995
+ "grad_norm": 1.4388149976730347,
1996
+ "learning_rate": 8.481655386202903e-07,
1997
+ "loss": 1.0218,
1998
+ "step": 2840
1999
+ },
2000
+ {
2001
+ "epoch": 0.8754415604361849,
2002
+ "grad_norm": 2.5872504711151123,
2003
+ "learning_rate": 8.08524688901825e-07,
2004
+ "loss": 1.0667,
2005
+ "step": 2850
2006
+ },
2007
+ {
2008
+ "epoch": 0.8785132852096452,
2009
+ "grad_norm": 1.5576486587524414,
2010
+ "learning_rate": 7.697934531862972e-07,
2011
+ "loss": 0.9662,
2012
+ "step": 2860
2013
+ },
2014
+ {
2015
+ "epoch": 0.8815850099831055,
2016
+ "grad_norm": 1.4606417417526245,
2017
+ "learning_rate": 7.319756644246878e-07,
2018
+ "loss": 0.9629,
2019
+ "step": 2870
2020
+ },
2021
+ {
2022
+ "epoch": 0.8846567347565658,
2023
+ "grad_norm": 1.4137325286865234,
2024
+ "learning_rate": 6.950750651707327e-07,
2025
+ "loss": 1.0736,
2026
+ "step": 2880
2027
+ },
2028
+ {
2029
+ "epoch": 0.8877284595300261,
2030
+ "grad_norm": 1.471315860748291,
2031
+ "learning_rate": 6.590953072105321e-07,
2032
+ "loss": 1.0666,
2033
+ "step": 2890
2034
+ },
2035
+ {
2036
+ "epoch": 0.8908001843034864,
2037
+ "grad_norm": 1.6415996551513672,
2038
+ "learning_rate": 6.240399512011664e-07,
2039
+ "loss": 1.0137,
2040
+ "step": 2900
2041
+ },
2042
+ {
2043
+ "epoch": 0.8938719090769467,
2044
+ "grad_norm": 1.6317228078842163,
2045
+ "learning_rate": 5.899124663183287e-07,
2046
+ "loss": 0.9415,
2047
+ "step": 2910
2048
+ },
2049
+ {
2050
+ "epoch": 0.896943633850407,
2051
+ "grad_norm": 1.4955772161483765,
2052
+ "learning_rate": 5.567162299129947e-07,
2053
+ "loss": 1.0241,
2054
+ "step": 2920
2055
+ },
2056
+ {
2057
+ "epoch": 0.9000153586238673,
2058
+ "grad_norm": 1.4452728033065796,
2059
+ "learning_rate": 5.244545271772016e-07,
2060
+ "loss": 0.9614,
2061
+ "step": 2930
2062
+ },
2063
+ {
2064
+ "epoch": 0.9030870833973276,
2065
+ "grad_norm": 1.4729427099227905,
2066
+ "learning_rate": 4.931305508189255e-07,
2067
+ "loss": 0.9674,
2068
+ "step": 2940
2069
+ },
2070
+ {
2071
+ "epoch": 0.9061588081707879,
2072
+ "grad_norm": 1.3615094423294067,
2073
+ "learning_rate": 4.6274740074613187e-07,
2074
+ "loss": 1.0474,
2075
+ "step": 2950
2076
+ },
2077
+ {
2078
+ "epoch": 0.9092305329442482,
2079
+ "grad_norm": 1.5536061525344849,
2080
+ "learning_rate": 4.33308083759999e-07,
2081
+ "loss": 0.9291,
2082
+ "step": 2960
2083
+ },
2084
+ {
2085
+ "epoch": 0.9123022577177085,
2086
+ "grad_norm": 1.582104206085205,
2087
+ "learning_rate": 4.0481551325734393e-07,
2088
+ "loss": 0.998,
2089
+ "step": 2970
2090
+ },
2091
+ {
2092
+ "epoch": 0.9153739824911687,
2093
+ "grad_norm": 1.5118730068206787,
2094
+ "learning_rate": 3.772725089423235e-07,
2095
+ "loss": 0.9718,
2096
+ "step": 2980
2097
+ },
2098
+ {
2099
+ "epoch": 0.918445707264629,
2100
+ "grad_norm": 1.7106198072433472,
2101
+ "learning_rate": 3.506817965473741e-07,
2102
+ "loss": 0.9625,
2103
+ "step": 2990
2104
+ },
2105
+ {
2106
+ "epoch": 0.9215174320380893,
2107
+ "grad_norm": 1.4147557020187378,
2108
+ "learning_rate": 3.2504600756347314e-07,
2109
+ "loss": 0.9654,
2110
+ "step": 3000
2111
+ },
2112
+ {
2113
+ "epoch": 0.9245891568115497,
2114
+ "grad_norm": 1.331018090248108,
2115
+ "learning_rate": 3.003676789797161e-07,
2116
+ "loss": 1.0091,
2117
+ "step": 3010
2118
+ },
2119
+ {
2120
+ "epoch": 0.92766088158501,
2121
+ "grad_norm": 1.653944969177246,
2122
+ "learning_rate": 2.7664925303224953e-07,
2123
+ "loss": 0.9399,
2124
+ "step": 3020
2125
+ },
2126
+ {
2127
+ "epoch": 0.9307326063584703,
2128
+ "grad_norm": 1.8000606298446655,
2129
+ "learning_rate": 2.5389307696258136e-07,
2130
+ "loss": 0.9671,
2131
+ "step": 3030
2132
+ },
2133
+ {
2134
+ "epoch": 0.9338043311319306,
2135
+ "grad_norm": 1.3829896450042725,
2136
+ "learning_rate": 2.321014027852908e-07,
2137
+ "loss": 1.0047,
2138
+ "step": 3040
2139
+ },
2140
+ {
2141
+ "epoch": 0.9368760559053909,
2142
+ "grad_norm": 1.5390887260437012,
2143
+ "learning_rate": 2.112763870651624e-07,
2144
+ "loss": 1.0249,
2145
+ "step": 3050
2146
+ },
2147
+ {
2148
+ "epoch": 0.9399477806788512,
2149
+ "grad_norm": 1.3523168563842773,
2150
+ "learning_rate": 1.9142009070377e-07,
2151
+ "loss": 0.922,
2152
+ "step": 3060
2153
+ },
2154
+ {
2155
+ "epoch": 0.9430195054523115,
2156
+ "grad_norm": 1.4008362293243408,
2157
+ "learning_rate": 1.7253447873551432e-07,
2158
+ "loss": 0.9748,
2159
+ "step": 3070
2160
+ },
2161
+ {
2162
+ "epoch": 0.9460912302257718,
2163
+ "grad_norm": 1.5816636085510254,
2164
+ "learning_rate": 1.5462142013317304e-07,
2165
+ "loss": 0.951,
2166
+ "step": 3080
2167
+ },
2168
+ {
2169
+ "epoch": 0.9491629549992321,
2170
+ "grad_norm": 1.5400006771087646,
2171
+ "learning_rate": 1.3768268762292537e-07,
2172
+ "loss": 0.9727,
2173
+ "step": 3090
2174
+ },
2175
+ {
2176
+ "epoch": 0.9522346797726924,
2177
+ "grad_norm": 1.4990243911743164,
2178
+ "learning_rate": 1.2171995750892896e-07,
2179
+ "loss": 1.0157,
2180
+ "step": 3100
2181
+ },
2182
+ {
2183
+ "epoch": 0.9553064045461527,
2184
+ "grad_norm": 1.3119021654129028,
2185
+ "learning_rate": 1.0673480950742831e-07,
2186
+ "loss": 1.0137,
2187
+ "step": 3110
2188
+ },
2189
+ {
2190
+ "epoch": 0.958378129319613,
2191
+ "grad_norm": 1.107062816619873,
2192
+ "learning_rate": 9.272872659041532e-08,
2193
+ "loss": 0.9122,
2194
+ "step": 3120
2195
+ },
2196
+ {
2197
+ "epoch": 0.9614498540930733,
2198
+ "grad_norm": 1.3335105180740356,
2199
+ "learning_rate": 7.970309483887329e-08,
2200
+ "loss": 0.9844,
2201
+ "step": 3130
2202
+ },
2203
+ {
2204
+ "epoch": 0.9645215788665336,
2205
+ "grad_norm": 1.968140721321106,
2206
+ "learning_rate": 6.765920330560894e-08,
2207
+ "loss": 1.0087,
2208
+ "step": 3140
2209
+ },
2210
+ {
2211
+ "epoch": 0.9675933036399939,
2212
+ "grad_norm": 1.6293946504592896,
2213
+ "learning_rate": 5.6598243887679984e-08,
2214
+ "loss": 0.9831,
2215
+ "step": 3150
2216
+ },
2217
+ {
2218
+ "epoch": 0.9706650284134541,
2219
+ "grad_norm": 1.341500163078308,
2220
+ "learning_rate": 4.652131120844727e-08,
2221
+ "loss": 1.016,
2222
+ "step": 3160
2223
+ },
2224
+ {
2225
+ "epoch": 0.9737367531869144,
2226
+ "grad_norm": 1.2992902994155884,
2227
+ "learning_rate": 3.74294025092381e-08,
2228
+ "loss": 0.9811,
2229
+ "step": 3170
2230
+ },
2231
+ {
2232
+ "epoch": 0.9768084779603747,
2233
+ "grad_norm": 1.4742531776428223,
2234
+ "learning_rate": 2.9323417550668475e-08,
2235
+ "loss": 0.9351,
2236
+ "step": 3180
2237
+ },
2238
+ {
2239
+ "epoch": 0.979880202733835,
2240
+ "grad_norm": 1.5771090984344482,
2241
+ "learning_rate": 2.2204158523592145e-08,
2242
+ "loss": 0.9744,
2243
+ "step": 3190
2244
+ },
2245
+ {
2246
+ "epoch": 0.9829519275072953,
2247
+ "grad_norm": 1.74678635597229,
2248
+ "learning_rate": 1.6072329969714085e-08,
2249
+ "loss": 0.9746,
2250
+ "step": 3200
2251
+ }
2252
+ ],
2253
+ "logging_steps": 10,
2254
+ "max_steps": 3256,
2255
+ "num_input_tokens_seen": 0,
2256
+ "num_train_epochs": 1,
2257
+ "save_steps": 200,
2258
+ "stateful_callbacks": {
2259
+ "TrainerControl": {
2260
+ "args": {
2261
+ "should_epoch_stop": false,
2262
+ "should_evaluate": false,
2263
+ "should_log": false,
2264
+ "should_save": true,
2265
+ "should_training_stop": false
2266
+ },
2267
+ "attributes": {}
2268
+ }
2269
+ },
2270
+ "total_flos": 2.7977296243889088e+17,
2271
+ "train_batch_size": 1,
2272
+ "trial_name": null,
2273
+ "trial_params": null
2274
+ }
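The entries above are the tail of `checkpoint-3200/trainer_state.json`: one record every 10 steps (`logging_steps: 10`), each carrying `epoch`, `grad_norm`, `learning_rate`, `loss`, and `step`, running up to step 3200 of `max_steps` 3256. Rather than scrolling the raw diff, the history can be read programmatically; a minimal sketch, assuming the checkpoint folder has been downloaded locally (the path below is illustrative):

```python
import json

# Illustrative local path to the uploaded checkpoint; adjust as needed.
with open("checkpoint-3200/trainer_state.json") as f:
    state = json.load(f)

# Each log_history entry carries epoch, grad_norm, learning_rate, loss, step.
for entry in state["log_history"]:
    print(f"step {entry['step']:>5}  epoch {entry['epoch']:.3f}  "
          f"loss {entry['loss']:.4f}  lr {entry['learning_rate']:.2e}")
```

Read this way, the logged training loss drifts from roughly 1.1–1.2 in the early steps shown here to roughly 0.95–1.05 by step 3200, while the learning rate decays from about 2e-05 toward zero.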
checkpoint-3200/training_args.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:427f9ef7775ae19943d0ad770e4668bee57f9626ec8f56e6b943d9567f7a0704
3
+ size 6097
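`training_args.bin` is stored through Git LFS, so the diff only shows the pointer (oid and size); the ~6 KB payload is the pickled `transformers.TrainingArguments` object that `Trainer` saves alongside each checkpoint. If needed it can be inspected locally; a hedged sketch (path illustrative, and `weights_only=False` is required because the file is a pickled Python object, so only do this for files you trust):

```python
import torch

# Illustrative path; training_args.bin holds a pickled TrainingArguments object.
args = torch.load("checkpoint-3200/training_args.bin", weights_only=False)
print(args.per_device_train_batch_size, args.learning_rate, args.num_train_epochs)
```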
checkpoint-3256/README.md ADDED
@@ -0,0 +1,209 @@
1
+ ---
2
+ base_model: unsloth/gpt-oss-20b-unsloth-bnb-4bit
3
+ library_name: peft
4
+ tags:
5
+ - base_model:adapter:unsloth/gpt-oss-20b-unsloth-bnb-4bit
6
+ - lora
7
+ - sft
8
+ - transformers
9
+ - trl
10
+ - unsloth
11
+ ---
12
+
13
+ # Model Card for Model ID
14
+
15
+ <!-- Provide a quick summary of what the model is/does. -->
16
+
17
+
18
+
19
+ ## Model Details
20
+
21
+ ### Model Description
22
+
23
+ <!-- Provide a longer summary of what this model is. -->
24
+
25
+
26
+
27
+ - **Developed by:** [More Information Needed]
28
+ - **Funded by [optional]:** [More Information Needed]
29
+ - **Shared by [optional]:** [More Information Needed]
30
+ - **Model type:** [More Information Needed]
31
+ - **Language(s) (NLP):** [More Information Needed]
32
+ - **License:** [More Information Needed]
33
+ - **Finetuned from model [optional]:** [More Information Needed]
34
+
35
+ ### Model Sources [optional]
36
+
37
+ <!-- Provide the basic links for the model. -->
38
+
39
+ - **Repository:** [More Information Needed]
40
+ - **Paper [optional]:** [More Information Needed]
41
+ - **Demo [optional]:** [More Information Needed]
42
+
43
+ ## Uses
44
+
45
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
46
+
47
+ ### Direct Use
48
+
49
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
50
+
51
+ [More Information Needed]
52
+
53
+ ### Downstream Use [optional]
54
+
55
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
56
+
57
+ [More Information Needed]
58
+
59
+ ### Out-of-Scope Use
60
+
61
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
62
+
63
+ [More Information Needed]
64
+
65
+ ## Bias, Risks, and Limitations
66
+
67
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
68
+
69
+ [More Information Needed]
70
+
71
+ ### Recommendations
72
+
73
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
74
+
75
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
76
+
77
+ ## How to Get Started with the Model
78
+
79
+ Use the code below to get started with the model.
80
+
81
+ [More Information Needed]
82
+
83
+ ## Training Details
84
+
85
+ ### Training Data
86
+
87
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
88
+
89
+ [More Information Needed]
90
+
91
+ ### Training Procedure
92
+
93
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
94
+
95
+ #### Preprocessing [optional]
96
+
97
+ [More Information Needed]
98
+
99
+
100
+ #### Training Hyperparameters
101
+
102
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
103
+
104
+ #### Speeds, Sizes, Times [optional]
105
+
106
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
107
+
108
+ [More Information Needed]
109
+
110
+ ## Evaluation
111
+
112
+ <!-- This section describes the evaluation protocols and provides the results. -->
113
+
114
+ ### Testing Data, Factors & Metrics
115
+
116
+ #### Testing Data
117
+
118
+ <!-- This should link to a Dataset Card if possible. -->
119
+
120
+ [More Information Needed]
121
+
122
+ #### Factors
123
+
124
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
125
+
126
+ [More Information Needed]
127
+
128
+ #### Metrics
129
+
130
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
131
+
132
+ [More Information Needed]
133
+
134
+ ### Results
135
+
136
+ [More Information Needed]
137
+
138
+ #### Summary
139
+
140
+
141
+
142
+ ## Model Examination [optional]
143
+
144
+ <!-- Relevant interpretability work for the model goes here -->
145
+
146
+ [More Information Needed]
147
+
148
+ ## Environmental Impact
149
+
150
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
151
+
152
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
153
+
154
+ - **Hardware Type:** [More Information Needed]
155
+ - **Hours used:** [More Information Needed]
156
+ - **Cloud Provider:** [More Information Needed]
157
+ - **Compute Region:** [More Information Needed]
158
+ - **Carbon Emitted:** [More Information Needed]
159
+
160
+ ## Technical Specifications [optional]
161
+
162
+ ### Model Architecture and Objective
163
+
164
+ [More Information Needed]
165
+
166
+ ### Compute Infrastructure
167
+
168
+ [More Information Needed]
169
+
170
+ #### Hardware
171
+
172
+ [More Information Needed]
173
+
174
+ #### Software
175
+
176
+ [More Information Needed]
177
+
178
+ ## Citation [optional]
179
+
180
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
181
+
182
+ **BibTeX:**
183
+
184
+ [More Information Needed]
185
+
186
+ **APA:**
187
+
188
+ [More Information Needed]
189
+
190
+ ## Glossary [optional]
191
+
192
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
193
+
194
+ [More Information Needed]
195
+
196
+ ## More Information [optional]
197
+
198
+ [More Information Needed]
199
+
200
+ ## Model Card Authors [optional]
201
+
202
+ [More Information Needed]
203
+
204
+ ## Model Card Contact
205
+
206
+ [More Information Needed]
207
+ ### Framework versions
208
+
209
+ - PEFT 0.17.0
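The auto-generated model card above still has placeholder sections, including "How to Get Started with the Model". Going only by the adapter config shipped next to it (base model `unsloth/gpt-oss-20b-unsloth-bnb-4bit`, `peft_type: LORA`, PEFT 0.17.0), a minimal loading sketch could look like the following; the local adapter path is an assumption, and quantized loading details (bitsandbytes, device placement) may differ per environment:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

BASE = "unsloth/gpt-oss-20b-unsloth-bnb-4bit"  # from adapter_config.json
ADAPTER = "./checkpoint-3256"                  # illustrative local path to this checkpoint

tokenizer = AutoTokenizer.from_pretrained(BASE)
model = AutoModelForCausalLM.from_pretrained(BASE, device_map="auto")

# Attach the LoRA weights (adapter_model.safetensors) on top of the base model.
model = PeftModel.from_pretrained(model, ADAPTER)
model.eval()
```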
checkpoint-3256/adapter_config.json ADDED
@@ -0,0 +1,37 @@
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": {
4
+ "base_model_class": "GptOssForCausalLM",
5
+ "parent_library": "transformers.models.gpt_oss.modeling_gpt_oss"
6
+ },
7
+ "base_model_name_or_path": "unsloth/gpt-oss-20b-unsloth-bnb-4bit",
8
+ "bias": "none",
9
+ "corda_config": null,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 16,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.05,
22
+ "megatron_config": null,
23
+ "megatron_core": "megatron.core",
24
+ "modules_to_save": null,
25
+ "peft_type": "LORA",
26
+ "qalora_group_size": 16,
27
+ "r": 64,
28
+ "rank_pattern": {},
29
+ "revision": null,
30
+ "target_modules": "(?:.*?(?:vision|image|visual|patch|language|text).*?(?:self_attn|attention|attn|mlp|feed_forward|ffn|dense).*?(?:q_proj|k_proj|v_proj|o_proj|linear|0|1|2|3|4|5|6|7|8|9|10|11|12|13|14|15|16|17|18|19|20|21|22|23|24|25|26|27|28|29|30|31).*?)|(?:\\bmodel\\.layers\\.[\\d]{1,}\\.(?:self_attn|attention|attn|mlp|feed_forward|ffn|dense)\\.(?:(?:q_proj|k_proj|v_proj|o_proj|linear|0|1|2|3|4|5|6|7|8|9|10|11|12|13|14|15|16|17|18|19|20|21|22|23|24|25|26|27|28|29|30|31)))",
31
+ "target_parameters": null,
32
+ "task_type": null,
33
+ "trainable_token_indices": null,
34
+ "use_dora": false,
35
+ "use_qalora": false,
36
+ "use_rslora": false
37
+ }
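The JSON above is what PEFT serialises for a LoRA adapter with rank 64, alpha 16 and dropout 0.05, selecting attention and MLP projection modules through a regular expression. As a rough illustration only, the same settings could be declared in code along these lines; the target_modules pattern here is a simplified stand-in for the much longer regex stored in the config:

```python
from peft import LoraConfig

# Sketch of the adapter settings above; not the exact object used for training.
lora_config = LoraConfig(
    r=64,                      # "r": 64
    lora_alpha=16,             # "lora_alpha": 16
    lora_dropout=0.05,         # "lora_dropout": 0.05
    bias="none",               # "bias": "none"
    # Simplified regex; the stored config matches a broader set of module names.
    target_modules=r"model\.layers\.\d+\.(self_attn|mlp)\.(q_proj|k_proj|v_proj|o_proj)",
    inference_mode=True,
)
```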
checkpoint-3256/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:414df4075b7595bf56271e9ff50ba872ce5fed05ad3924bf14a8a78ebc6b8ebf
+ size 127427864
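The three lines above are a Git LFS pointer, not the weights themselves; the roughly 127 MB adapter_model.safetensors is stored in LFS and resolved when the repo is cloned or downloaded. A small sketch of fetching it directly, with a placeholder repo id:

```python
from huggingface_hub import hf_hub_download

# Fetches the real safetensors file behind the LFS pointer; repo_id is a placeholder.
local_path = hf_hub_download(
    repo_id="<user>/<this-lora-repo>",
    filename="adapter_model.safetensors",
)
print(local_path)
```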
checkpoint-3256/chat_template.jinja ADDED
@@ -0,0 +1,315 @@
1
+ {# Copyright 2025-present Unsloth. Apache 2.0 License. Unsloth chat template fixes. Edited from ggml-org & OpenAI #}
2
+ {#-
3
+ In addition to the normal inputs of `messages` and `tools`, this template also accepts the
4
+ following kwargs:
5
+ - "builtin_tools": A list, can contain "browser" and/or "python".
6
+ - "model_identity": A string that optionally describes the model identity.
7
+ - "reasoning_effort": A string that describes the reasoning effort, defaults to "medium".
8
+ #}
9
+
10
+ {#- Tool Definition Rendering ============================================== #}
11
+ {%- macro render_typescript_type(param_spec, required_params, is_nullable=false) -%}
12
+ {%- if param_spec.type == "array" -%}
13
+ {%- if param_spec['items'] -%}
14
+ {%- if param_spec['items']['type'] == "string" -%}
15
+ {{- "string[]" }}
16
+ {%- elif param_spec['items']['type'] == "number" -%}
17
+ {{- "number[]" }}
18
+ {%- elif param_spec['items']['type'] == "integer" -%}
19
+ {{- "number[]" }}
20
+ {%- elif param_spec['items']['type'] == "boolean" -%}
21
+ {{- "boolean[]" }}
22
+ {%- else -%}
23
+ {%- set inner_type = render_typescript_type(param_spec['items'], required_params) -%}
24
+ {%- if inner_type == "object | object" or inner_type|length > 50 -%}
25
+ {{- "any[]" }}
26
+ {%- else -%}
27
+ {{- inner_type + "[]" }}
28
+ {%- endif -%}
29
+ {%- endif -%}
30
+ {%- if param_spec.nullable -%}
31
+ {{- " | null" }}
32
+ {%- endif -%}
33
+ {%- else -%}
34
+ {{- "any[]" }}
35
+ {%- if param_spec.nullable -%}
36
+ {{- " | null" }}
37
+ {%- endif -%}
38
+ {%- endif -%}
39
+ {%- elif param_spec.type is defined and param_spec.type is iterable and param_spec.type is not string and param_spec.type is not mapping and param_spec.type[0] is defined -%}
40
+ {#- Handle array of types like ["object", "object"] from Union[dict, list] #}
41
+ {%- if param_spec.type | length > 1 -%}
42
+ {{- param_spec.type | join(" | ") }}
43
+ {%- else -%}
44
+ {{- param_spec.type[0] }}
45
+ {%- endif -%}
46
+ {%- elif param_spec.oneOf -%}
47
+ {#- Handle oneOf schemas - check for complex unions and fallback to any #}
48
+ {%- set has_object_variants = false -%}
49
+ {%- for variant in param_spec.oneOf -%}
50
+ {%- if variant.type == "object" -%}
51
+ {%- set has_object_variants = true -%}
52
+ {%- endif -%}
53
+ {%- endfor -%}
54
+ {%- if has_object_variants and param_spec.oneOf|length > 1 -%}
55
+ {{- "any" }}
56
+ {%- else -%}
57
+ {%- for variant in param_spec.oneOf -%}
58
+ {{- render_typescript_type(variant, required_params) -}}
59
+ {%- if variant.description %}
60
+ {{- "// " + variant.description }}
61
+ {%- endif -%}
62
+ {%- if variant.default is defined %}
63
+ {{ "// default: " + variant.default|tojson }}
64
+ {%- endif -%}
65
+ {%- if not loop.last %}
66
+ {{- " | " }}
67
+ {% endif -%}
68
+ {%- endfor -%}
69
+ {%- endif -%}
70
+ {%- elif param_spec.type == "string" -%}
71
+ {%- if param_spec.enum -%}
72
+ {{- '"' + param_spec.enum|join('" | "') + '"' -}}
73
+ {%- else -%}
74
+ {{- "string" }}
75
+ {%- if param_spec.nullable %}
76
+ {{- " | null" }}
77
+ {%- endif -%}
78
+ {%- endif -%}
79
+ {%- elif param_spec.type == "number" -%}
80
+ {{- "number" }}
81
+ {%- elif param_spec.type == "integer" -%}
82
+ {{- "number" }}
83
+ {%- elif param_spec.type == "boolean" -%}
84
+ {{- "boolean" }}
85
+
86
+ {%- elif param_spec.type == "object" -%}
87
+ {%- if param_spec.properties -%}
88
+ {{- "{\n" }}
89
+ {%- for prop_name, prop_spec in param_spec.properties.items() -%}
90
+ {{- prop_name -}}
91
+ {%- if prop_name not in (param_spec.required or []) -%}
92
+ {{- "?" }}
93
+ {%- endif -%}
94
+ {{- ": " }}
95
+ {{ render_typescript_type(prop_spec, param_spec.required or []) }}
96
+ {%- if not loop.last -%}
97
+ {{-", " }}
98
+ {%- endif -%}
99
+ {%- endfor -%}
100
+ {{- "}" }}
101
+ {%- else -%}
102
+ {{- "object" }}
103
+ {%- endif -%}
104
+ {%- else -%}
105
+ {{- "any" }}
106
+ {%- endif -%}
107
+ {%- endmacro -%}
108
+
109
+ {%- macro render_tool_namespace(namespace_name, tools) -%}
110
+ {{- "## " + namespace_name + "\n\n" }}
111
+ {{- "namespace " + namespace_name + " {\n\n" }}
112
+ {%- for tool in tools %}
113
+ {%- set tool = tool.function %}
114
+ {{- "// " + tool.description + "\n" }}
115
+ {{- "type "+ tool.name + " = " }}
116
+ {%- if tool.parameters and tool.parameters.properties -%}
117
+ {{- "(_: " }}
118
+ {{- "{\n" }}
119
+ {%- for param_name, param_spec in tool.parameters.properties.items() %}
120
+ {{- "// " + param_spec.description + "\n" }}
121
+ {{- param_name }}
122
+ {%- if param_name not in (tool.parameters.required or []) -%}
123
+ {{- "?" }}
124
+ {%- endif -%}
125
+ {{- ": " }}
126
+ {{- render_typescript_type(param_spec, tool.parameters.required or []) }}
127
+ {%- if param_spec.default is defined -%}
128
+ {%- if param_spec.enum %}
129
+ {{- ", // default: " + param_spec.default }}
130
+ {%- elif param_spec.oneOf %}
131
+ {{- "// default: " + param_spec.default }}
132
+ {%- else %}
133
+ {{- ", // default: " + param_spec.default|tojson }}
134
+ {%- endif -%}
135
+ {%- endif -%}
136
+ {%- if not loop.last %}
137
+ {{- ",\n" }}
138
+ {%- else %}
139
+ {{- "\n" }}
140
+ {%- endif -%}
141
+ {%- endfor %}
142
+ {{- "}) => any;\n\n" }}
143
+ {%- else -%}
144
+ {{- "() => any;\n\n" }}
145
+ {%- endif -%}
146
+ {%- endfor %}
147
+ {{- "} // namespace " + namespace_name }}
148
+ {%- endmacro -%}
149
+
150
+ {%- macro render_builtin_tools(browser_tool, python_tool) -%}
151
+ {%- if browser_tool %}
152
+ {{- "## browser\n\n" }}
153
+ {{- "// Tool for browsing.\n" }}
154
+ {{- "// The `cursor` appears in brackets before each browsing display: `[{cursor}]`.\n" }}
155
+ {{- "// Cite information from the tool using the following format:\n" }}
156
+ {{- "// `【{cursor}†L{line_start}(-L{line_end})?】`, for example: `【6†L9-L11】` or `【8†L3】`.\n" }}
157
+ {{- "// Do not quote more than 10 words directly from the tool output.\n" }}
158
+ {{- "// sources=web (default: web)\n" }}
159
+ {{- "namespace browser {\n\n" }}
160
+ {{- "// Searches for information related to `query` and displays `topn` results.\n" }}
161
+ {{- "type search = (_: {\n" }}
162
+ {{- "query: string,\n" }}
163
+ {{- "topn?: number, // default: 10\n" }}
164
+ {{- "source?: string,\n" }}
165
+ {{- "}) => any;\n\n" }}
166
+ {{- "// Opens the link `id` from the page indicated by `cursor` starting at line number `loc`, showing `num_lines` lines.\n" }}
167
+ {{- "// Valid link ids are displayed with the formatting: `【{id}†.*】`.\n" }}
168
+ {{- "// If `cursor` is not provided, the most recent page is implied.\n" }}
169
+ {{- "// If `id` is a string, it is treated as a fully qualified URL associated with `source`.\n" }}
170
+ {{- "// If `loc` is not provided, the viewport will be positioned at the beginning of the document or centered on the most relevant passage, if available.\n" }}
171
+ {{- "// Use this function without `id` to scroll to a new location of an opened page.\n" }}
172
+ {{- "type open = (_: {\n" }}
173
+ {{- "id?: number | string, // default: -1\n" }}
174
+ {{- "cursor?: number, // default: -1\n" }}
175
+ {{- "loc?: number, // default: -1\n" }}
176
+ {{- "num_lines?: number, // default: -1\n" }}
177
+ {{- "view_source?: boolean, // default: false\n" }}
178
+ {{- "source?: string,\n" }}
179
+ {{- "}) => any;\n\n" }}
180
+ {{- "// Finds exact matches of `pattern` in the current page, or the page given by `cursor`.\n" }}
181
+ {{- "type find = (_: {\n" }}
182
+ {{- "pattern: string,\n" }}
183
+ {{- "cursor?: number, // default: -1\n" }}
184
+ {{- "}) => any;\n\n" }}
185
+ {{- "} // namespace browser\n\n" }}
186
+ {%- endif -%}
187
+
188
+ {%- if python_tool %}
189
+ {{- "## python\n\n" }}
190
+ {{- "Use this tool to execute Python code in your chain of thought. The code will not be shown to the user. This tool should be used for internal reasoning, but not for code that is intended to be visible to the user (e.g. when creating plots, tables, or files).\n\n" }}
191
+ {{- "When you send a message containing Python code to python, it will be executed in a stateful Jupyter notebook environment. python will respond with the output of the execution or time out after 120.0 seconds. The drive at '/mnt/data' can be used to save and persist user files. Internet access for this session is UNKNOWN. Depends on the cluster.\n\n" }}
192
+ {%- endif -%}
193
+ {%- endmacro -%}
194
+
195
+ {#- System Message Construction ============================================ #}
196
+ {%- macro build_system_message() -%}
197
+ {%- if model_identity is not defined %}
198
+ {{- "You are ChatGPT, a large language model trained by OpenAI.\n" -}}
199
+ {%- else %}
200
+ {{- model_identity }}
201
+ {%- endif %}
202
+ {{- "Knowledge cutoff: 2024-06\n" }}
203
+ {{- "Current date: " + strftime_now("%Y-%m-%d") + "\n\n" }}
204
+ {%- if reasoning_effort is not defined %}
205
+ {%- set reasoning_effort = "medium" %}
206
+ {%- endif %}
207
+ {{- "Reasoning: " + reasoning_effort + "\n\n" }}
208
+ {%- if builtin_tools is defined %}
209
+ {{- "# Tools\n\n" }}
210
+ {%- set available_builtin_tools = namespace(browser=false, python=false) %}
211
+ {%- for tool in builtin_tools %}
212
+ {%- if tool == "browser" %}
213
+ {%- set available_builtin_tools.browser = true %}
214
+ {%- elif tool == "python" %}
215
+ {%- set available_builtin_tools.python = true %}
216
+ {%- endif %}
217
+ {%- endfor %}
218
+ {{- render_builtin_tools(available_builtin_tools.browser, available_builtin_tools.python) }}
219
+ {%- endif -%}
220
+ {{- "# Valid channels: analysis, commentary, final. Channel must be included for every message." }}
221
+ {%- if tools is defined -%}
222
+ {{- "\nCalls to these tools must go to the commentary channel: 'functions'." }}
223
+ {%- endif -%}
224
+ {%- endmacro -%}
225
+
226
+ {#- Main Template Logic ================================================= #}
227
+ {#- Set defaults #}
228
+
229
+ {#- Render system message #}
230
+ {{- "<|start|>system<|message|>" }}
231
+ {{- build_system_message() }}
232
+ {{- "<|end|>" }}
233
+
234
+ {#- Extract developer message #}
235
+ {%- if messages[0].role == "developer" or messages[0].role == "system" %}
236
+ {%- set developer_message = messages[0].content %}
237
+ {%- set loop_messages = messages[1:] %}
238
+ {%- else %}
239
+ {%- set developer_message = "" %}
240
+ {%- set loop_messages = messages %}
241
+ {%- endif %}
242
+
243
+ {#- Render developer message #}
244
+ {%- if developer_message or tools %}
245
+ {{- "<|start|>developer<|message|>" }}
246
+ {%- if developer_message %}
247
+ {{- "# Instructions\n\n" }}
248
+ {{- developer_message }}
249
+ {%- endif %}
250
+ {%- if tools -%}
251
+ {{- "\n\n" }}
252
+ {{- "# Tools\n\n" }}
253
+ {{- render_tool_namespace("functions", tools) }}
254
+ {%- endif -%}
255
+ {{- "<|end|>" }}
256
+ {%- endif %}
257
+
258
+ {#- Render messages #}
259
+ {%- set last_tool_call = namespace(name=none) %}
260
+ {%- for message in loop_messages -%}
261
+ {#- At this point only assistant/user/tool messages should remain #}
262
+ {%- if message.role == 'assistant' -%}
263
+ {%- if "tool_calls" in message %}
264
+ {#- We assume max 1 tool call per message, and so we infer the tool call name #}
265
+ {#- in "tool" messages from the most recent assistant tool call name #}
266
+ {%- set tool_call = message.tool_calls[0] %}
267
+ {%- if tool_call.function %}
268
+ {%- set tool_call = tool_call.function %}
269
+ {%- endif %}
270
+ {%- if message.content %}
271
+ {{- "<|start|>assistant<|channel|>analysis<|message|>" + message.content + "<|end|>" }}
272
+ {%- endif %}
273
+ {{- "<|start|>assistant to=" }}
274
+ {{- "functions." + tool_call.name + "<|channel|>commentary json<|message|>" }}
275
+ {{- tool_call.arguments|tojson }}
276
+ {{- "<|call|>" }}
277
+ {%- set last_tool_call.name = tool_call.name %}
278
+ {%- elif "thinking" in message and loop.last and not add_generation_prompt %}
279
+ {#- Only render the CoT if the final turn is an assistant turn and add_generation_prompt is false #}
280
+ {#- This is a situation that should only occur in training, never in inference. #}
281
+ {{- "<|start|>assistant<|channel|>analysis<|message|>" + message.thinking + "<|end|>" }}
282
+ {#- <|return|> indicates the end of generation, but <|end|> does not #}
283
+ {#- <|return|> should never be an input to the model, but we include it as the final token #}
284
+ {#- when training, so the model learns to emit it. #}
285
+ {{- "<|start|>assistant<|channel|>final<|message|>" + message.content + "<|return|>" }}
286
+ {%- set last_tool_call.name = none %}
287
+ {%- elif "thinking" in message %}
288
+ {#- CoT is dropped during all previous turns, so we never render it for inference #}
289
+ {{- "<|start|>assistant<|channel|>final<|message|>" + message.content + "<|end|>" }}
290
+ {%- set last_tool_call.name = none %}
291
+ {%- elif loop.last and not add_generation_prompt %}
292
+ {#- <|return|> indicates the end of generation, but <|end|> does not #}
293
+ {#- <|return|> should never be an input to the model, but we include it as the final token #}
294
+ {#- when training, so the model learns to emit it. #}
295
+ {{- "<|start|>assistant<|message|>" + message.content + "<|return|>" }}
296
+ {%- else %}
297
+ {{- "<|start|>assistant<|message|>" + message.content + "<|end|>" }}
298
+ {%- set last_tool_call.name = none %}
299
+ {%- endif %}
300
+ {%- elif message.role == 'tool' -%}
301
+ {%- if last_tool_call.name is none %}
302
+ {{- raise_exception("Message has tool role, but there was no previous assistant message with a tool call!") }}
303
+ {%- endif %}
304
+ {{- "<|start|>functions." + last_tool_call.name }}
305
+ {{- " to=assistant<|channel|>commentary<|message|>" + message.content|tojson + "<|end|>" }}
306
+ {%- else -%}
307
+ {{- "<|start|>user<|message|>" + message.content + "<|end|>" }}
308
+ {%- endif -%}
309
+ {%- endfor -%}
310
+
311
+ {#- Generation prompt #}
312
+ {%- if add_generation_prompt -%}
313
+ <|start|>assistant
314
+ {%- endif -%}
315
+ {# Copyright 2025-present Unsloth. Apache 2.0 License. Unsloth chat template fixes. Edited from ggml-org & OpenAI #}
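The template above is what tokenizer.apply_chat_template renders: a system block, an optional developer block carrying tool definitions, and channel-tagged assistant turns ending in <|return|>. A minimal sketch of exercising it through transformers follows; the base repo is used as a stand-in tokenizer source, and forwarding extra template kwargs such as reasoning_effort depends on the installed transformers version, so treat that argument as an assumption.

```python
from transformers import AutoTokenizer

# Sketch: render the chat template bundled with this checkpoint.
tokenizer = AutoTokenizer.from_pretrained("unsloth/gpt-oss-20b-unsloth-bnb-4bit")

messages = [
    {"role": "system", "content": "Answer as briefly as possible."},
    {"role": "user", "content": "What does a LoRA adapter change in the base model?"},
]

prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,   # appends "<|start|>assistant" per the template's final block
    reasoning_effort="low",       # extra kwarg consumed by this template (defaults to "medium")
)
print(prompt)
```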
checkpoint-3256/optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:902e72e3846a365200e8b5159cb3c0d20b7000b1a61876d43de2f2e0925ab6ac
+ size 64923339
checkpoint-3256/rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:07001ef67086ea010ded154f7ef5090a3fc694d1aa02d5f1fcb1aebc6962eb6f
+ size 14645
checkpoint-3256/scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d8cccbe9e8663ad9f8c7ba7a68bd1185ec3c1735a28a45a322ea0206819df29e
+ size 1465
checkpoint-3256/special_tokens_map.json ADDED
@@ -0,0 +1,23 @@
+ {
+ "bos_token": {
+ "content": "<|startoftext|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "<|return|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": {
+ "content": "<|reserved_200017|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+ }
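The map above pins <|startoftext|> as BOS, <|return|> as EOS (the same end-of-generation token the chat template emits), and a reserved token as padding. A quick sanity check against a loaded tokenizer, again using the base repo as a stand-in:

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("unsloth/gpt-oss-20b-unsloth-bnb-4bit")
print(tok.bos_token, tok.eos_token, tok.pad_token)
# Expected to mirror the JSON above: <|startoftext|> <|return|> <|reserved_200017|>
```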
checkpoint-3256/tokenizer.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d54615cba5113384c7495974d13feabc433e3e27e9262ac6a1a77f762a48d1c8
+ size 27868273
checkpoint-3256/tokenizer_config.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bd9f6c1cec89c00a50e0eea8e27f8feadc89a155d49368d7a76e7de1f462cff5
+ size 4229
checkpoint-3256/trainer_state.json ADDED
@@ -0,0 +1,2309 @@
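The trainer state below records one epoch (3256 optimiser steps) of log_history entries, each logging epoch, grad_norm, learning_rate, loss and step every 10 steps; the loss falls from about 7.0 during warmup to roughly 1.0. A small sketch for plotting that curve, assuming the checkpoint directory has been downloaded locally:

```python
import json

import matplotlib.pyplot as plt

# Plot the training-loss curve recorded in the trainer state below.
with open("checkpoint-3256/trainer_state.json") as f:
    state = json.load(f)

entries = [e for e in state["log_history"] if "loss" in e]
steps = [e["step"] for e in entries]
losses = [e["loss"] for e in entries]

plt.plot(steps, losses)
plt.xlabel("step")
plt.ylabel("training loss")
plt.title("gpt-oss-20b LoRA fine-tune (1 epoch, 3256 steps)")
plt.show()
```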
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 1.0,
6
+ "eval_steps": 500,
7
+ "global_step": 3256,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.003071724773460298,
14
+ "grad_norm": 8.419622421264648,
15
+ "learning_rate": 1.8367346938775512e-06,
16
+ "loss": 7.0015,
17
+ "step": 10
18
+ },
19
+ {
20
+ "epoch": 0.006143449546920596,
21
+ "grad_norm": 10.31521224975586,
22
+ "learning_rate": 3.877551020408164e-06,
23
+ "loss": 6.9781,
24
+ "step": 20
25
+ },
26
+ {
27
+ "epoch": 0.009215174320380893,
28
+ "grad_norm": 11.546846389770508,
29
+ "learning_rate": 5.918367346938776e-06,
30
+ "loss": 6.8691,
31
+ "step": 30
32
+ },
33
+ {
34
+ "epoch": 0.012286899093841192,
35
+ "grad_norm": 6.222589015960693,
36
+ "learning_rate": 7.959183673469388e-06,
37
+ "loss": 6.2584,
38
+ "step": 40
39
+ },
40
+ {
41
+ "epoch": 0.01535862386730149,
42
+ "grad_norm": 5.404016971588135,
43
+ "learning_rate": 1e-05,
44
+ "loss": 5.5826,
45
+ "step": 50
46
+ },
47
+ {
48
+ "epoch": 0.018430348640761787,
49
+ "grad_norm": 3.200059652328491,
50
+ "learning_rate": 1.2040816326530614e-05,
51
+ "loss": 4.7731,
52
+ "step": 60
53
+ },
54
+ {
55
+ "epoch": 0.021502073414222087,
56
+ "grad_norm": 2.567652463912964,
57
+ "learning_rate": 1.4081632653061225e-05,
58
+ "loss": 4.1807,
59
+ "step": 70
60
+ },
61
+ {
62
+ "epoch": 0.024573798187682384,
63
+ "grad_norm": 2.171948194503784,
64
+ "learning_rate": 1.612244897959184e-05,
65
+ "loss": 3.6721,
66
+ "step": 80
67
+ },
68
+ {
69
+ "epoch": 0.02764552296114268,
70
+ "grad_norm": 1.7318979501724243,
71
+ "learning_rate": 1.816326530612245e-05,
72
+ "loss": 3.2164,
73
+ "step": 90
74
+ },
75
+ {
76
+ "epoch": 0.03071724773460298,
77
+ "grad_norm": 1.588425636291504,
78
+ "learning_rate": 1.9999995051820308e-05,
79
+ "loss": 2.8804,
80
+ "step": 100
81
+ },
82
+ {
83
+ "epoch": 0.03378897250806328,
84
+ "grad_norm": 1.8087046146392822,
85
+ "learning_rate": 1.9999401276182468e-05,
86
+ "loss": 2.4881,
87
+ "step": 110
88
+ },
89
+ {
90
+ "epoch": 0.03686069728152357,
91
+ "grad_norm": 1.8364018201828003,
92
+ "learning_rate": 1.999781793193742e-05,
93
+ "loss": 2.0435,
94
+ "step": 120
95
+ },
96
+ {
97
+ "epoch": 0.039932422054983874,
98
+ "grad_norm": 1.41903817653656,
99
+ "learning_rate": 1.9995245175777322e-05,
100
+ "loss": 1.8227,
101
+ "step": 130
102
+ },
103
+ {
104
+ "epoch": 0.043004146828444174,
105
+ "grad_norm": 1.4173569679260254,
106
+ "learning_rate": 1.9991683262309292e-05,
107
+ "loss": 1.5215,
108
+ "step": 140
109
+ },
110
+ {
111
+ "epoch": 0.04607587160190447,
112
+ "grad_norm": 1.2530128955841064,
113
+ "learning_rate": 1.998713254403021e-05,
114
+ "loss": 1.4296,
115
+ "step": 150
116
+ },
117
+ {
118
+ "epoch": 0.04914759637536477,
119
+ "grad_norm": 0.9256352782249451,
120
+ "learning_rate": 1.9981593471291828e-05,
121
+ "loss": 1.3521,
122
+ "step": 160
123
+ },
124
+ {
125
+ "epoch": 0.05221932114882506,
126
+ "grad_norm": 1.2156378030776978,
127
+ "learning_rate": 1.9975066592256226e-05,
128
+ "loss": 1.2796,
129
+ "step": 170
130
+ },
131
+ {
132
+ "epoch": 0.05529104592228536,
133
+ "grad_norm": 0.7624578475952148,
134
+ "learning_rate": 1.996755255284153e-05,
135
+ "loss": 1.2564,
136
+ "step": 180
137
+ },
138
+ {
139
+ "epoch": 0.058362770695745664,
140
+ "grad_norm": 0.8461359739303589,
141
+ "learning_rate": 1.9959052096658015e-05,
142
+ "loss": 1.2598,
143
+ "step": 190
144
+ },
145
+ {
146
+ "epoch": 0.06143449546920596,
147
+ "grad_norm": 0.7803846597671509,
148
+ "learning_rate": 1.99495660649345e-05,
149
+ "loss": 1.292,
150
+ "step": 200
151
+ },
152
+ {
153
+ "epoch": 0.06450622024266625,
154
+ "grad_norm": 0.8108134865760803,
155
+ "learning_rate": 1.9939095396435123e-05,
156
+ "loss": 1.274,
157
+ "step": 210
158
+ },
159
+ {
160
+ "epoch": 0.06757794501612656,
161
+ "grad_norm": 0.6554253697395325,
162
+ "learning_rate": 1.99276411273664e-05,
163
+ "loss": 1.1686,
164
+ "step": 220
165
+ },
166
+ {
167
+ "epoch": 0.07064966978958685,
168
+ "grad_norm": 0.7851625084877014,
169
+ "learning_rate": 1.991520439127471e-05,
170
+ "loss": 1.2176,
171
+ "step": 230
172
+ },
173
+ {
174
+ "epoch": 0.07372139456304715,
175
+ "grad_norm": 0.933299720287323,
176
+ "learning_rate": 1.9901786418934105e-05,
177
+ "loss": 1.2398,
178
+ "step": 240
179
+ },
180
+ {
181
+ "epoch": 0.07679311933650745,
182
+ "grad_norm": 0.8452703356742859,
183
+ "learning_rate": 1.9887388538224504e-05,
184
+ "loss": 1.2785,
185
+ "step": 250
186
+ },
187
+ {
188
+ "epoch": 0.07986484410996775,
189
+ "grad_norm": 0.7061429619789124,
190
+ "learning_rate": 1.9872012174000298e-05,
191
+ "loss": 1.1552,
192
+ "step": 260
193
+ },
194
+ {
195
+ "epoch": 0.08293656888342804,
196
+ "grad_norm": 0.6826983690261841,
197
+ "learning_rate": 1.9855658847949324e-05,
198
+ "loss": 1.0678,
199
+ "step": 270
200
+ },
201
+ {
202
+ "epoch": 0.08600829365688835,
203
+ "grad_norm": 0.7688930630683899,
204
+ "learning_rate": 1.9838330178442288e-05,
205
+ "loss": 1.2263,
206
+ "step": 280
207
+ },
208
+ {
209
+ "epoch": 0.08908001843034864,
210
+ "grad_norm": 0.725660502910614,
211
+ "learning_rate": 1.9820027880372598e-05,
212
+ "loss": 1.1576,
213
+ "step": 290
214
+ },
215
+ {
216
+ "epoch": 0.09215174320380894,
217
+ "grad_norm": 0.8614407181739807,
218
+ "learning_rate": 1.980075376498666e-05,
219
+ "loss": 1.1923,
220
+ "step": 300
221
+ },
222
+ {
223
+ "epoch": 0.09522346797726923,
224
+ "grad_norm": 0.7618085145950317,
225
+ "learning_rate": 1.9780509739704623e-05,
226
+ "loss": 1.2018,
227
+ "step": 310
228
+ },
229
+ {
230
+ "epoch": 0.09829519275072954,
231
+ "grad_norm": 0.8258978724479675,
232
+ "learning_rate": 1.9759297807931634e-05,
233
+ "loss": 1.1869,
234
+ "step": 320
235
+ },
236
+ {
237
+ "epoch": 0.10136691752418983,
238
+ "grad_norm": 0.8120176196098328,
239
+ "learning_rate": 1.9737120068859546e-05,
240
+ "loss": 1.1871,
241
+ "step": 330
242
+ },
243
+ {
244
+ "epoch": 0.10443864229765012,
245
+ "grad_norm": 0.6733288168907166,
246
+ "learning_rate": 1.9713978717259207e-05,
247
+ "loss": 1.1853,
248
+ "step": 340
249
+ },
250
+ {
251
+ "epoch": 0.10751036707111043,
252
+ "grad_norm": 0.7299067974090576,
253
+ "learning_rate": 1.9689876043263238e-05,
254
+ "loss": 1.1829,
255
+ "step": 350
256
+ },
257
+ {
258
+ "epoch": 0.11058209184457073,
259
+ "grad_norm": 0.8712752461433411,
260
+ "learning_rate": 1.9664814432139408e-05,
261
+ "loss": 1.1995,
262
+ "step": 360
263
+ },
264
+ {
265
+ "epoch": 0.11365381661803102,
266
+ "grad_norm": 0.9290866255760193,
267
+ "learning_rate": 1.9638796364054566e-05,
268
+ "loss": 1.1771,
269
+ "step": 370
270
+ },
271
+ {
272
+ "epoch": 0.11672554139149133,
273
+ "grad_norm": 0.7293970584869385,
274
+ "learning_rate": 1.9611824413829215e-05,
275
+ "loss": 1.134,
276
+ "step": 380
277
+ },
278
+ {
279
+ "epoch": 0.11979726616495162,
280
+ "grad_norm": 1.0634404420852661,
281
+ "learning_rate": 1.9583901250682687e-05,
282
+ "loss": 1.1265,
283
+ "step": 390
284
+ },
285
+ {
286
+ "epoch": 0.12286899093841191,
287
+ "grad_norm": 0.9366344213485718,
288
+ "learning_rate": 1.9555029637969005e-05,
289
+ "loss": 1.1802,
290
+ "step": 400
291
+ },
292
+ {
293
+ "epoch": 0.1259407157118722,
294
+ "grad_norm": 0.8466936945915222,
295
+ "learning_rate": 1.9525212432903388e-05,
296
+ "loss": 1.2034,
297
+ "step": 410
298
+ },
299
+ {
300
+ "epoch": 0.1290124404853325,
301
+ "grad_norm": 0.7974659204483032,
302
+ "learning_rate": 1.9494452586279516e-05,
303
+ "loss": 1.1439,
304
+ "step": 420
305
+ },
306
+ {
307
+ "epoch": 0.13208416525879282,
308
+ "grad_norm": 1.080421805381775,
309
+ "learning_rate": 1.9462753142177507e-05,
310
+ "loss": 1.12,
311
+ "step": 430
312
+ },
313
+ {
314
+ "epoch": 0.13515589003225312,
315
+ "grad_norm": 0.9817642569541931,
316
+ "learning_rate": 1.9430117237662654e-05,
317
+ "loss": 1.1423,
318
+ "step": 440
319
+ },
320
+ {
321
+ "epoch": 0.1382276148057134,
322
+ "grad_norm": 0.8396655917167664,
323
+ "learning_rate": 1.9396548102474992e-05,
324
+ "loss": 1.146,
325
+ "step": 450
326
+ },
327
+ {
328
+ "epoch": 0.1412993395791737,
329
+ "grad_norm": 0.8537242412567139,
330
+ "learning_rate": 1.936204905870966e-05,
331
+ "loss": 1.0609,
332
+ "step": 460
333
+ },
334
+ {
335
+ "epoch": 0.144371064352634,
336
+ "grad_norm": 0.7413896918296814,
337
+ "learning_rate": 1.932662352048813e-05,
338
+ "loss": 1.1508,
339
+ "step": 470
340
+ },
341
+ {
342
+ "epoch": 0.1474427891260943,
343
+ "grad_norm": 0.9202362895011902,
344
+ "learning_rate": 1.929027499362036e-05,
345
+ "loss": 1.1131,
346
+ "step": 480
347
+ },
348
+ {
349
+ "epoch": 0.1505145138995546,
350
+ "grad_norm": 1.1642792224884033,
351
+ "learning_rate": 1.9253007075257833e-05,
352
+ "loss": 1.1172,
353
+ "step": 490
354
+ },
355
+ {
356
+ "epoch": 0.1535862386730149,
357
+ "grad_norm": 0.8923482894897461,
358
+ "learning_rate": 1.9214823453537568e-05,
359
+ "loss": 1.1486,
360
+ "step": 500
361
+ },
362
+ {
363
+ "epoch": 0.1566579634464752,
364
+ "grad_norm": 0.8619294166564941,
365
+ "learning_rate": 1.9175727907217153e-05,
366
+ "loss": 1.0841,
367
+ "step": 510
368
+ },
369
+ {
370
+ "epoch": 0.1597296882199355,
371
+ "grad_norm": 0.8567745089530945,
372
+ "learning_rate": 1.9135724305300757e-05,
373
+ "loss": 1.2078,
374
+ "step": 520
375
+ },
376
+ {
377
+ "epoch": 0.1628014129933958,
378
+ "grad_norm": 0.924022912979126,
379
+ "learning_rate": 1.9094816606656272e-05,
380
+ "loss": 1.0892,
381
+ "step": 530
382
+ },
383
+ {
384
+ "epoch": 0.16587313776685608,
385
+ "grad_norm": 1.2680057287216187,
386
+ "learning_rate": 1.9053008859623527e-05,
387
+ "loss": 1.1733,
388
+ "step": 540
389
+ },
390
+ {
391
+ "epoch": 0.16894486254031638,
392
+ "grad_norm": 0.9168750643730164,
393
+ "learning_rate": 1.9010305201613625e-05,
394
+ "loss": 1.0919,
395
+ "step": 550
396
+ },
397
+ {
398
+ "epoch": 0.1720165873137767,
399
+ "grad_norm": 1.0453463792800903,
400
+ "learning_rate": 1.8966709858699542e-05,
401
+ "loss": 1.1425,
402
+ "step": 560
403
+ },
404
+ {
405
+ "epoch": 0.175088312087237,
406
+ "grad_norm": 0.9679042100906372,
407
+ "learning_rate": 1.8922227145197856e-05,
408
+ "loss": 1.1223,
409
+ "step": 570
410
+ },
411
+ {
412
+ "epoch": 0.17816003686069729,
413
+ "grad_norm": 1.0852516889572144,
414
+ "learning_rate": 1.887686146324182e-05,
415
+ "loss": 1.09,
416
+ "step": 580
417
+ },
418
+ {
419
+ "epoch": 0.18123176163415758,
420
+ "grad_norm": 1.0620839595794678,
421
+ "learning_rate": 1.8830617302345706e-05,
422
+ "loss": 1.1012,
423
+ "step": 590
424
+ },
425
+ {
426
+ "epoch": 0.18430348640761787,
427
+ "grad_norm": 0.8156954646110535,
428
+ "learning_rate": 1.8783499238960495e-05,
429
+ "loss": 1.0889,
430
+ "step": 600
431
+ },
432
+ {
433
+ "epoch": 0.18737521118107817,
434
+ "grad_norm": 1.028428077697754,
435
+ "learning_rate": 1.8735511936021016e-05,
436
+ "loss": 1.1586,
437
+ "step": 610
438
+ },
439
+ {
440
+ "epoch": 0.19044693595453846,
441
+ "grad_norm": 0.893195629119873,
442
+ "learning_rate": 1.8686660142484446e-05,
443
+ "loss": 1.0948,
444
+ "step": 620
445
+ },
446
+ {
447
+ "epoch": 0.19351866072799878,
448
+ "grad_norm": 1.1205710172653198,
449
+ "learning_rate": 1.8636948692860373e-05,
450
+ "loss": 1.141,
451
+ "step": 630
452
+ },
453
+ {
454
+ "epoch": 0.19659038550145908,
455
+ "grad_norm": 0.8362760543823242,
456
+ "learning_rate": 1.8586382506732334e-05,
457
+ "loss": 1.0666,
458
+ "step": 640
459
+ },
460
+ {
461
+ "epoch": 0.19966211027491937,
462
+ "grad_norm": 0.8918741345405579,
463
+ "learning_rate": 1.8534966588270987e-05,
464
+ "loss": 1.0774,
465
+ "step": 650
466
+ },
467
+ {
468
+ "epoch": 0.20273383504837966,
469
+ "grad_norm": 1.0254731178283691,
470
+ "learning_rate": 1.8482706025738856e-05,
471
+ "loss": 1.0671,
472
+ "step": 660
473
+ },
474
+ {
475
+ "epoch": 0.20580555982183996,
476
+ "grad_norm": 0.9713171720504761,
477
+ "learning_rate": 1.8429605990986797e-05,
478
+ "loss": 1.1728,
479
+ "step": 670
480
+ },
481
+ {
482
+ "epoch": 0.20887728459530025,
483
+ "grad_norm": 1.1824350357055664,
484
+ "learning_rate": 1.8375671738942183e-05,
485
+ "loss": 1.0991,
486
+ "step": 680
487
+ },
488
+ {
489
+ "epoch": 0.21194900936876057,
490
+ "grad_norm": 0.9485971331596375,
491
+ "learning_rate": 1.8320908607088847e-05,
492
+ "loss": 1.1219,
493
+ "step": 690
494
+ },
495
+ {
496
+ "epoch": 0.21502073414222087,
497
+ "grad_norm": 0.9417327642440796,
498
+ "learning_rate": 1.8265322014938883e-05,
499
+ "loss": 1.093,
500
+ "step": 700
501
+ },
502
+ {
503
+ "epoch": 0.21809245891568116,
504
+ "grad_norm": 1.0645897388458252,
505
+ "learning_rate": 1.82089174634963e-05,
506
+ "loss": 1.0401,
507
+ "step": 710
508
+ },
509
+ {
510
+ "epoch": 0.22116418368914145,
511
+ "grad_norm": 1.0113904476165771,
512
+ "learning_rate": 1.815170053471265e-05,
513
+ "loss": 1.0896,
514
+ "step": 720
515
+ },
516
+ {
517
+ "epoch": 0.22423590846260175,
518
+ "grad_norm": 0.988387405872345,
519
+ "learning_rate": 1.80936768909346e-05,
520
+ "loss": 1.1149,
521
+ "step": 730
522
+ },
523
+ {
524
+ "epoch": 0.22730763323606204,
525
+ "grad_norm": 1.0920623540878296,
526
+ "learning_rate": 1.8034852274343585e-05,
527
+ "loss": 1.1012,
528
+ "step": 740
529
+ },
530
+ {
531
+ "epoch": 0.23037935800952233,
532
+ "grad_norm": 1.0729286670684814,
533
+ "learning_rate": 1.797523250638754e-05,
534
+ "loss": 1.1034,
535
+ "step": 750
536
+ },
537
+ {
538
+ "epoch": 0.23345108278298266,
539
+ "grad_norm": 1.054861068725586,
540
+ "learning_rate": 1.7914823487204796e-05,
541
+ "loss": 1.1171,
542
+ "step": 760
543
+ },
544
+ {
545
+ "epoch": 0.23652280755644295,
546
+ "grad_norm": 1.0314829349517822,
547
+ "learning_rate": 1.7853631195040178e-05,
548
+ "loss": 1.0606,
549
+ "step": 770
550
+ },
551
+ {
552
+ "epoch": 0.23959453232990324,
553
+ "grad_norm": 1.354514718055725,
554
+ "learning_rate": 1.7791661685653395e-05,
555
+ "loss": 1.0771,
556
+ "step": 780
557
+ },
558
+ {
559
+ "epoch": 0.24266625710336354,
560
+ "grad_norm": 1.1574825048446655,
561
+ "learning_rate": 1.7728921091719733e-05,
562
+ "loss": 1.0866,
563
+ "step": 790
564
+ },
565
+ {
566
+ "epoch": 0.24573798187682383,
567
+ "grad_norm": 1.0411062240600586,
568
+ "learning_rate": 1.7665415622223155e-05,
569
+ "loss": 1.0832,
570
+ "step": 800
571
+ },
572
+ {
573
+ "epoch": 0.24880970665028412,
574
+ "grad_norm": 1.0036734342575073,
575
+ "learning_rate": 1.760115156184184e-05,
576
+ "loss": 1.0529,
577
+ "step": 810
578
+ },
579
+ {
580
+ "epoch": 0.2518814314237444,
581
+ "grad_norm": 1.2075008153915405,
582
+ "learning_rate": 1.753613527032623e-05,
583
+ "loss": 1.1115,
584
+ "step": 820
585
+ },
586
+ {
587
+ "epoch": 0.2549531561972047,
588
+ "grad_norm": 1.1035667657852173,
589
+ "learning_rate": 1.7470373181869667e-05,
590
+ "loss": 1.0905,
591
+ "step": 830
592
+ },
593
+ {
594
+ "epoch": 0.258024880970665,
595
+ "grad_norm": 1.0951581001281738,
596
+ "learning_rate": 1.740387180447162e-05,
597
+ "loss": 1.0305,
598
+ "step": 840
599
+ },
600
+ {
601
+ "epoch": 0.26109660574412535,
602
+ "grad_norm": 1.2153475284576416,
603
+ "learning_rate": 1.7336637719293667e-05,
604
+ "loss": 1.0678,
605
+ "step": 850
606
+ },
607
+ {
608
+ "epoch": 0.26416833051758565,
609
+ "grad_norm": 1.0323575735092163,
610
+ "learning_rate": 1.726867758000818e-05,
611
+ "loss": 1.041,
612
+ "step": 860
613
+ },
614
+ {
615
+ "epoch": 0.26724005529104594,
616
+ "grad_norm": 1.111869215965271,
617
+ "learning_rate": 1.7199998112139863e-05,
618
+ "loss": 1.0982,
619
+ "step": 870
620
+ },
621
+ {
622
+ "epoch": 0.27031178006450624,
623
+ "grad_norm": 1.1253854036331177,
624
+ "learning_rate": 1.71306061124002e-05,
625
+ "loss": 1.1006,
626
+ "step": 880
627
+ },
628
+ {
629
+ "epoch": 0.27338350483796653,
630
+ "grad_norm": 1.165722131729126,
631
+ "learning_rate": 1.706050844801479e-05,
632
+ "loss": 1.0595,
633
+ "step": 890
634
+ },
635
+ {
636
+ "epoch": 0.2764552296114268,
637
+ "grad_norm": 1.0169954299926758,
638
+ "learning_rate": 1.6989712056043786e-05,
639
+ "loss": 1.0459,
640
+ "step": 900
641
+ },
642
+ {
643
+ "epoch": 0.2795269543848871,
644
+ "grad_norm": 1.1502857208251953,
645
+ "learning_rate": 1.6918223942695374e-05,
646
+ "loss": 1.0645,
647
+ "step": 910
648
+ },
649
+ {
650
+ "epoch": 0.2825986791583474,
651
+ "grad_norm": 0.9404005408287048,
652
+ "learning_rate": 1.6846051182632396e-05,
653
+ "loss": 1.0838,
654
+ "step": 920
655
+ },
656
+ {
657
+ "epoch": 0.2856704039318077,
658
+ "grad_norm": 0.9998802542686462,
659
+ "learning_rate": 1.6773200918272257e-05,
660
+ "loss": 1.0573,
661
+ "step": 930
662
+ },
663
+ {
664
+ "epoch": 0.288742128705268,
665
+ "grad_norm": 1.1572364568710327,
666
+ "learning_rate": 1.6699680359080066e-05,
667
+ "loss": 1.0408,
668
+ "step": 940
669
+ },
670
+ {
671
+ "epoch": 0.2918138534787283,
672
+ "grad_norm": 1.0092674493789673,
673
+ "learning_rate": 1.662549678085518e-05,
674
+ "loss": 1.0499,
675
+ "step": 950
676
+ },
677
+ {
678
+ "epoch": 0.2948855782521886,
679
+ "grad_norm": 1.1230064630508423,
680
+ "learning_rate": 1.6550657525011163e-05,
681
+ "loss": 1.0278,
682
+ "step": 960
683
+ },
684
+ {
685
+ "epoch": 0.2979573030256489,
686
+ "grad_norm": 1.0530235767364502,
687
+ "learning_rate": 1.6475169997849267e-05,
688
+ "loss": 1.0977,
689
+ "step": 970
690
+ },
691
+ {
692
+ "epoch": 0.3010290277991092,
693
+ "grad_norm": 1.2637819051742554,
694
+ "learning_rate": 1.6399041669825478e-05,
695
+ "loss": 1.0096,
696
+ "step": 980
697
+ },
698
+ {
699
+ "epoch": 0.3041007525725695,
700
+ "grad_norm": 1.0472123622894287,
701
+ "learning_rate": 1.632228007481122e-05,
702
+ "loss": 1.0773,
703
+ "step": 990
704
+ },
705
+ {
706
+ "epoch": 0.3071724773460298,
707
+ "grad_norm": 1.0698893070220947,
708
+ "learning_rate": 1.624489280934778e-05,
709
+ "loss": 1.0724,
710
+ "step": 1000
711
+ },
712
+ {
713
+ "epoch": 0.3102442021194901,
714
+ "grad_norm": 1.4538482427597046,
715
+ "learning_rate": 1.616688753189454e-05,
716
+ "loss": 1.0787,
717
+ "step": 1010
718
+ },
719
+ {
720
+ "epoch": 0.3133159268929504,
721
+ "grad_norm": 1.0167417526245117,
722
+ "learning_rate": 1.6088271962071067e-05,
723
+ "loss": 1.1064,
724
+ "step": 1020
725
+ },
726
+ {
727
+ "epoch": 0.3163876516664107,
728
+ "grad_norm": 1.1629698276519775,
729
+ "learning_rate": 1.6009053879893164e-05,
730
+ "loss": 1.0071,
731
+ "step": 1030
732
+ },
733
+ {
734
+ "epoch": 0.319459376439871,
735
+ "grad_norm": 1.2197659015655518,
736
+ "learning_rate": 1.5929241125002936e-05,
737
+ "loss": 1.0701,
738
+ "step": 1040
739
+ },
740
+ {
741
+ "epoch": 0.3225311012133313,
742
+ "grad_norm": 1.2297234535217285,
743
+ "learning_rate": 1.584884159589295e-05,
744
+ "loss": 1.0286,
745
+ "step": 1050
746
+ },
747
+ {
748
+ "epoch": 0.3256028259867916,
749
+ "grad_norm": 1.1668790578842163,
750
+ "learning_rate": 1.5767863249124588e-05,
751
+ "loss": 1.0598,
752
+ "step": 1060
753
+ },
754
+ {
755
+ "epoch": 0.32867455076025187,
756
+ "grad_norm": 1.219712257385254,
757
+ "learning_rate": 1.5686314098540643e-05,
758
+ "loss": 1.0586,
759
+ "step": 1070
760
+ },
761
+ {
762
+ "epoch": 0.33174627553371216,
763
+ "grad_norm": 1.072532057762146,
764
+ "learning_rate": 1.560420221447224e-05,
765
+ "loss": 1.0879,
766
+ "step": 1080
767
+ },
768
+ {
769
+ "epoch": 0.33481800030717246,
770
+ "grad_norm": 1.2042076587677002,
771
+ "learning_rate": 1.552153572294018e-05,
772
+ "loss": 1.0622,
773
+ "step": 1090
774
+ },
775
+ {
776
+ "epoch": 0.33788972508063275,
777
+ "grad_norm": 1.2498126029968262,
778
+ "learning_rate": 1.5438322804850762e-05,
779
+ "loss": 1.0422,
780
+ "step": 1100
781
+ },
782
+ {
783
+ "epoch": 0.34096144985409305,
784
+ "grad_norm": 1.2515864372253418,
785
+ "learning_rate": 1.5354571695186175e-05,
786
+ "loss": 1.0303,
787
+ "step": 1110
788
+ },
789
+ {
790
+ "epoch": 0.3440331746275534,
791
+ "grad_norm": 1.1558114290237427,
792
+ "learning_rate": 1.5270290682189556e-05,
793
+ "loss": 0.9732,
794
+ "step": 1120
795
+ },
796
+ {
797
+ "epoch": 0.3471048994010137,
798
+ "grad_norm": 1.3687546253204346,
799
+ "learning_rate": 1.5185488106544743e-05,
800
+ "loss": 1.0289,
801
+ "step": 1130
802
+ },
803
+ {
804
+ "epoch": 0.350176624174474,
805
+ "grad_norm": 1.5358903408050537,
806
+ "learning_rate": 1.5100172360550873e-05,
807
+ "loss": 1.0093,
808
+ "step": 1140
809
+ },
810
+ {
811
+ "epoch": 0.3532483489479343,
812
+ "grad_norm": 2.732478380203247,
813
+ "learning_rate": 1.5014351887291843e-05,
814
+ "loss": 1.0015,
815
+ "step": 1150
816
+ },
817
+ {
818
+ "epoch": 0.35632007372139457,
819
+ "grad_norm": 2.0520331859588623,
820
+ "learning_rate": 1.4928035179800772e-05,
821
+ "loss": 0.9988,
822
+ "step": 1160
823
+ },
824
+ {
825
+ "epoch": 0.35939179849485486,
826
+ "grad_norm": 1.1731293201446533,
827
+ "learning_rate": 1.48412307802195e-05,
828
+ "loss": 0.9697,
829
+ "step": 1170
830
+ },
831
+ {
832
+ "epoch": 0.36246352326831516,
833
+ "grad_norm": 1.8224886655807495,
834
+ "learning_rate": 1.4753947278953235e-05,
835
+ "loss": 1.0542,
836
+ "step": 1180
837
+ },
838
+ {
839
+ "epoch": 0.36553524804177545,
840
+ "grad_norm": 1.2448983192443848,
841
+ "learning_rate": 1.4666193313820418e-05,
842
+ "loss": 1.028,
843
+ "step": 1190
844
+ },
845
+ {
846
+ "epoch": 0.36860697281523574,
847
+ "grad_norm": 1.2823021411895752,
848
+ "learning_rate": 1.4577977569197915e-05,
849
+ "loss": 1.0719,
850
+ "step": 1200
851
+ },
852
+ {
853
+ "epoch": 0.37167869758869604,
854
+ "grad_norm": 1.0568265914916992,
855
+ "learning_rate": 1.4489308775161564e-05,
856
+ "loss": 1.0142,
857
+ "step": 1210
858
+ },
859
+ {
860
+ "epoch": 0.37475042236215633,
861
+ "grad_norm": 1.0901318788528442,
862
+ "learning_rate": 1.4400195706622247e-05,
863
+ "loss": 1.046,
864
+ "step": 1220
865
+ },
866
+ {
867
+ "epoch": 0.3778221471356166,
868
+ "grad_norm": 1.1522573232650757,
869
+ "learning_rate": 1.4310647182457475e-05,
870
+ "loss": 1.0342,
871
+ "step": 1230
872
+ },
873
+ {
874
+ "epoch": 0.3808938719090769,
875
+ "grad_norm": 1.0686454772949219,
876
+ "learning_rate": 1.422067206463868e-05,
877
+ "loss": 1.0486,
878
+ "step": 1240
879
+ },
880
+ {
881
+ "epoch": 0.38396559668253727,
882
+ "grad_norm": 1.1028980016708374,
883
+ "learning_rate": 1.413027925735417e-05,
884
+ "loss": 1.0219,
885
+ "step": 1250
886
+ },
887
+ {
888
+ "epoch": 0.38703732145599756,
889
+ "grad_norm": 1.297943353652954,
890
+ "learning_rate": 1.4039477706127982e-05,
891
+ "loss": 1.0349,
892
+ "step": 1260
893
+ },
894
+ {
895
+ "epoch": 0.39010904622945786,
896
+ "grad_norm": 1.081758737564087,
897
+ "learning_rate": 1.3948276396934578e-05,
898
+ "loss": 1.0313,
899
+ "step": 1270
900
+ },
901
+ {
902
+ "epoch": 0.39318077100291815,
903
+ "grad_norm": 1.3776434659957886,
904
+ "learning_rate": 1.3856684355309598e-05,
905
+ "loss": 0.9959,
906
+ "step": 1280
907
+ },
908
+ {
909
+ "epoch": 0.39625249577637844,
910
+ "grad_norm": 1.3725264072418213,
911
+ "learning_rate": 1.3764710645456639e-05,
912
+ "loss": 1.0351,
913
+ "step": 1290
914
+ },
915
+ {
916
+ "epoch": 0.39932422054983874,
917
+ "grad_norm": 1.3018957376480103,
918
+ "learning_rate": 1.3672364369350254e-05,
919
+ "loss": 1.0865,
920
+ "step": 1300
921
+ },
922
+ {
923
+ "epoch": 0.40239594532329903,
924
+ "grad_norm": 1.0644463300704956,
925
+ "learning_rate": 1.357965466583519e-05,
926
+ "loss": 1.026,
927
+ "step": 1310
928
+ },
929
+ {
930
+ "epoch": 0.4054676700967593,
931
+ "grad_norm": 1.2380294799804688,
932
+ "learning_rate": 1.348659070972199e-05,
933
+ "loss": 1.0132,
934
+ "step": 1320
935
+ },
936
+ {
937
+ "epoch": 0.4085393948702196,
938
+ "grad_norm": 1.39878249168396,
939
+ "learning_rate": 1.3393181710879002e-05,
940
+ "loss": 1.0792,
941
+ "step": 1330
942
+ },
943
+ {
944
+ "epoch": 0.4116111196436799,
945
+ "grad_norm": 1.2087831497192383,
946
+ "learning_rate": 1.3299436913320982e-05,
947
+ "loss": 1.0377,
948
+ "step": 1340
949
+ },
950
+ {
951
+ "epoch": 0.4146828444171402,
952
+ "grad_norm": 1.3151507377624512,
953
+ "learning_rate": 1.3205365594294257e-05,
954
+ "loss": 1.0384,
955
+ "step": 1350
956
+ },
957
+ {
958
+ "epoch": 0.4177545691906005,
959
+ "grad_norm": 1.2046235799789429,
960
+ "learning_rate": 1.3110977063358626e-05,
961
+ "loss": 0.996,
962
+ "step": 1360
963
+ },
964
+ {
965
+ "epoch": 0.4208262939640608,
966
+ "grad_norm": 1.3252286911010742,
967
+ "learning_rate": 1.3016280661466063e-05,
968
+ "loss": 1.0521,
969
+ "step": 1370
970
+ },
971
+ {
972
+ "epoch": 0.42389801873752114,
973
+ "grad_norm": 1.315597653388977,
974
+ "learning_rate": 1.292128576003631e-05,
975
+ "loss": 0.984,
976
+ "step": 1380
977
+ },
978
+ {
979
+ "epoch": 0.42696974351098144,
980
+ "grad_norm": 1.131601095199585,
981
+ "learning_rate": 1.2826001760029453e-05,
982
+ "loss": 1.064,
983
+ "step": 1390
984
+ },
985
+ {
986
+ "epoch": 0.43004146828444173,
987
+ "grad_norm": 1.1241368055343628,
988
+ "learning_rate": 1.2730438091015587e-05,
989
+ "loss": 1.0045,
990
+ "step": 1400
991
+ },
992
+ {
993
+ "epoch": 0.433113193057902,
994
+ "grad_norm": 1.1869995594024658,
995
+ "learning_rate": 1.263460421024162e-05,
996
+ "loss": 1.0165,
997
+ "step": 1410
998
+ },
999
+ {
1000
+ "epoch": 0.4361849178313623,
1001
+ "grad_norm": 1.1647521257400513,
1002
+ "learning_rate": 1.2538509601695372e-05,
1003
+ "loss": 0.9639,
1004
+ "step": 1420
1005
+ },
1006
+ {
1007
+ "epoch": 0.4392566426048226,
1008
+ "grad_norm": 1.3259695768356323,
1009
+ "learning_rate": 1.2442163775167009e-05,
1010
+ "loss": 1.0209,
1011
+ "step": 1430
1012
+ },
1013
+ {
1014
+ "epoch": 0.4423283673782829,
1015
+ "grad_norm": 1.061746597290039,
1016
+ "learning_rate": 1.2345576265307934e-05,
1017
+ "loss": 1.0044,
1018
+ "step": 1440
1019
+ },
1020
+ {
1021
+ "epoch": 0.4454000921517432,
1022
+ "grad_norm": 1.3238203525543213,
1023
+ "learning_rate": 1.2248756630687204e-05,
1024
+ "loss": 1.0122,
1025
+ "step": 1450
1026
+ },
1027
+ {
1028
+ "epoch": 0.4484718169252035,
1029
+ "grad_norm": 1.1721187829971313,
1030
+ "learning_rate": 1.2151714452845582e-05,
1031
+ "loss": 1.0814,
1032
+ "step": 1460
1033
+ },
1034
+ {
1035
+ "epoch": 0.4515435416986638,
1036
+ "grad_norm": 1.0964605808258057,
1037
+ "learning_rate": 1.2054459335347333e-05,
1038
+ "loss": 1.0629,
1039
+ "step": 1470
1040
+ },
1041
+ {
1042
+ "epoch": 0.4546152664721241,
1043
+ "grad_norm": 1.2595895528793335,
1044
+ "learning_rate": 1.1957000902829827e-05,
1045
+ "loss": 1.0172,
1046
+ "step": 1480
1047
+ },
1048
+ {
1049
+ "epoch": 0.4576869912455844,
1050
+ "grad_norm": 1.1863325834274292,
1051
+ "learning_rate": 1.1859348800051047e-05,
1052
+ "loss": 0.9903,
1053
+ "step": 1490
1054
+ },
1055
+ {
1056
+ "epoch": 0.46075871601904467,
1057
+ "grad_norm": 1.2627488374710083,
1058
+ "learning_rate": 1.1761512690935135e-05,
1059
+ "loss": 1.0419,
1060
+ "step": 1500
1061
+ },
1062
+ {
1063
+ "epoch": 0.463830440792505,
1064
+ "grad_norm": 1.2958710193634033,
1065
+ "learning_rate": 1.1663502257616002e-05,
1066
+ "loss": 0.9637,
1067
+ "step": 1510
1068
+ },
1069
+ {
1070
+ "epoch": 0.4669021655659653,
1071
+ "grad_norm": 1.5594003200531006,
1072
+ "learning_rate": 1.1565327199479173e-05,
1073
+ "loss": 0.9793,
1074
+ "step": 1520
1075
+ },
1076
+ {
1077
+ "epoch": 0.4699738903394256,
1078
+ "grad_norm": 1.2414544820785522,
1079
+ "learning_rate": 1.1466997232201901e-05,
1080
+ "loss": 1.0322,
1081
+ "step": 1530
1082
+ },
1083
+ {
1084
+ "epoch": 0.4730456151128859,
1085
+ "grad_norm": 1.5642577409744263,
1086
+ "learning_rate": 1.1368522086791688e-05,
1087
+ "loss": 1.0769,
1088
+ "step": 1540
1089
+ },
1090
+ {
1091
+ "epoch": 0.4761173398863462,
1092
+ "grad_norm": 1.3605339527130127,
1093
+ "learning_rate": 1.1269911508623255e-05,
1094
+ "loss": 0.9849,
1095
+ "step": 1550
1096
+ },
1097
+ {
1098
+ "epoch": 0.4791890646598065,
1099
+ "grad_norm": 1.3629966974258423,
1100
+ "learning_rate": 1.1171175256474137e-05,
1101
+ "loss": 1.0511,
1102
+ "step": 1560
1103
+ },
1104
+ {
1105
+ "epoch": 0.4822607894332668,
1106
+ "grad_norm": 1.4438694715499878,
1107
+ "learning_rate": 1.1072323101558908e-05,
1108
+ "loss": 0.9724,
1109
+ "step": 1570
1110
+ },
1111
+ {
1112
+ "epoch": 0.4853325142067271,
1113
+ "grad_norm": 1.2920620441436768,
1114
+ "learning_rate": 1.0973364826562208e-05,
1115
+ "loss": 1.0242,
1116
+ "step": 1580
1117
+ },
1118
+ {
1119
+ "epoch": 0.48840423898018737,
1120
+ "grad_norm": 1.1293901205062866,
1121
+ "learning_rate": 1.0874310224670615e-05,
1122
+ "loss": 0.9828,
1123
+ "step": 1590
1124
+ },
1125
+ {
1126
+ "epoch": 0.49147596375364766,
1127
+ "grad_norm": 1.3501414060592651,
1128
+ "learning_rate": 1.0775169098603487e-05,
1129
+ "loss": 1.0432,
1130
+ "step": 1600
1131
+ },
1132
+ {
1133
+ "epoch": 0.49454768852710795,
1134
+ "grad_norm": 1.3344357013702393,
1135
+ "learning_rate": 1.0675951259642846e-05,
1136
+ "loss": 1.0102,
1137
+ "step": 1610
1138
+ },
1139
+ {
1140
+ "epoch": 0.49761941330056825,
1141
+ "grad_norm": 1.354965090751648,
1142
+ "learning_rate": 1.0576666526662447e-05,
1143
+ "loss": 0.9579,
1144
+ "step": 1620
1145
+ },
1146
+ {
1147
+ "epoch": 0.5006911380740285,
1148
+ "grad_norm": 1.2013002634048462,
1149
+ "learning_rate": 1.0477324725156058e-05,
1150
+ "loss": 1.0215,
1151
+ "step": 1630
1152
+ },
1153
+ {
1154
+ "epoch": 0.5037628628474888,
1155
+ "grad_norm": 1.5074472427368164,
1156
+ "learning_rate": 1.037793568626511e-05,
1157
+ "loss": 1.0077,
1158
+ "step": 1640
1159
+ },
1160
+ {
1161
+ "epoch": 0.5068345876209491,
1162
+ "grad_norm": 1.3318787813186646,
1163
+ "learning_rate": 1.0278509245805774e-05,
1164
+ "loss": 1.0502,
1165
+ "step": 1650
1166
+ },
1167
+ {
1168
+ "epoch": 0.5099063123944094,
1169
+ "grad_norm": 1.3787329196929932,
1170
+ "learning_rate": 1.0179055243295587e-05,
1171
+ "loss": 1.0129,
1172
+ "step": 1660
1173
+ },
1174
+ {
1175
+ "epoch": 0.5129780371678697,
1176
+ "grad_norm": 1.1923975944519043,
1177
+ "learning_rate": 1.0079583520979694e-05,
1178
+ "loss": 1.0129,
1179
+ "step": 1670
1180
+ },
1181
+ {
1182
+ "epoch": 0.51604976194133,
1183
+ "grad_norm": 1.707875370979309,
1184
+ "learning_rate": 9.980103922856861e-06,
1185
+ "loss": 1.0259,
1186
+ "step": 1680
1187
+ },
1188
+ {
1189
+ "epoch": 0.5191214867147903,
1190
+ "grad_norm": 1.3130182027816772,
1191
+ "learning_rate": 9.880626293705255e-06,
1192
+ "loss": 0.9913,
1193
+ "step": 1690
1194
+ },
1195
+ {
1196
+ "epoch": 0.5221932114882507,
1197
+ "grad_norm": 2.6558756828308105,
1198
+ "learning_rate": 9.781160478108177e-06,
1199
+ "loss": 0.9719,
1200
+ "step": 1700
1201
+ },
1202
+ {
1203
+ "epoch": 0.525264936261711,
1204
+ "grad_norm": 1.4856926202774048,
1205
+ "learning_rate": 9.68171631947984e-06,
1206
+ "loss": 1.0207,
1207
+ "step": 1710
1208
+ },
1209
+ {
1210
+ "epoch": 0.5283366610351713,
1211
+ "grad_norm": 1.590119481086731,
1212
+ "learning_rate": 9.582303659091222e-06,
1213
+ "loss": 1.023,
1214
+ "step": 1720
1215
+ },
1216
+ {
1217
+ "epoch": 0.5314083858086316,
1218
+ "grad_norm": 1.4528511762619019,
1219
+ "learning_rate": 9.482932335096144e-06,
1220
+ "loss": 1.0007,
1221
+ "step": 1730
1222
+ },
1223
+ {
1224
+ "epoch": 0.5344801105820919,
1225
+ "grad_norm": 1.351442813873291,
1226
+ "learning_rate": 9.38361218155766e-06,
1227
+ "loss": 0.9612,
1228
+ "step": 1740
1229
+ },
1230
+ {
1231
+ "epoch": 0.5375518353555522,
1232
+ "grad_norm": 1.8360393047332764,
1233
+ "learning_rate": 9.28435302747486e-06,
1234
+ "loss": 1.0286,
1235
+ "step": 1750
1236
+ },
1237
+ {
1238
+ "epoch": 0.5406235601290125,
1239
+ "grad_norm": 1.3718026876449585,
1240
+ "learning_rate": 9.18516469581015e-06,
1241
+ "loss": 1.0251,
1242
+ "step": 1760
1243
+ },
1244
+ {
1245
+ "epoch": 0.5436952849024728,
1246
+ "grad_norm": 3.648052930831909,
1247
+ "learning_rate": 9.086057002517163e-06,
1248
+ "loss": 1.0119,
1249
+ "step": 1770
1250
+ },
1251
+ {
1252
+ "epoch": 0.5467670096759331,
1253
+ "grad_norm": 1.5079731941223145,
1254
+ "learning_rate": 8.98703975556932e-06,
1255
+ "loss": 1.0646,
1256
+ "step": 1780
1257
+ },
1258
+ {
1259
+ "epoch": 0.5498387344493934,
1260
+ "grad_norm": 1.3732868432998657,
1261
+ "learning_rate": 8.88812275398923e-06,
1262
+ "loss": 1.0119,
1263
+ "step": 1790
1264
+ },
1265
+ {
1266
+ "epoch": 0.5529104592228536,
1267
+ "grad_norm": 1.21127450466156,
1268
+ "learning_rate": 8.789315786878936e-06,
1269
+ "loss": 0.9586,
1270
+ "step": 1800
1271
+ },
1272
+ {
1273
+ "epoch": 0.5559821839963139,
1274
+ "grad_norm": 1.5817351341247559,
1275
+ "learning_rate": 8.69062863245117e-06,
1276
+ "loss": 1.0425,
1277
+ "step": 1810
1278
+ },
1279
+ {
1280
+ "epoch": 0.5590539087697742,
1281
+ "grad_norm": 1.689554214477539,
1282
+ "learning_rate": 8.59207105706166e-06,
1283
+ "loss": 1.0406,
1284
+ "step": 1820
1285
+ },
1286
+ {
1287
+ "epoch": 0.5621256335432345,
1288
+ "grad_norm": 1.2953405380249023,
1289
+ "learning_rate": 8.493652814242632e-06,
1290
+ "loss": 0.9987,
1291
+ "step": 1830
1292
+ },
1293
+ {
1294
+ "epoch": 0.5651973583166948,
1295
+ "grad_norm": 1.519486904144287,
1296
+ "learning_rate": 8.395383643737575e-06,
1297
+ "loss": 0.9818,
1298
+ "step": 1840
1299
+ },
1300
+ {
1301
+ "epoch": 0.5682690830901551,
1302
+ "grad_norm": 1.1716630458831787,
1303
+ "learning_rate": 8.297273270537372e-06,
1304
+ "loss": 0.9914,
1305
+ "step": 1850
1306
+ },
1307
+ {
1308
+ "epoch": 0.5713408078636154,
1309
+ "grad_norm": 1.412215232849121,
1310
+ "learning_rate": 8.199331403917869e-06,
1311
+ "loss": 0.9952,
1312
+ "step": 1860
1313
+ },
1314
+ {
1315
+ "epoch": 0.5744125326370757,
1316
+ "grad_norm": 1.7365607023239136,
1317
+ "learning_rate": 8.101567736479044e-06,
1318
+ "loss": 0.9934,
1319
+ "step": 1870
1320
+ },
1321
+ {
1322
+ "epoch": 0.577484257410536,
1323
+ "grad_norm": 1.4288475513458252,
1324
+ "learning_rate": 8.003991943185778e-06,
1325
+ "loss": 1.0203,
1326
+ "step": 1880
1327
+ },
1328
+ {
1329
+ "epoch": 0.5805559821839963,
1330
+ "grad_norm": 1.3489760160446167,
1331
+ "learning_rate": 7.906613680410415e-06,
1332
+ "loss": 0.9778,
1333
+ "step": 1890
1334
+ },
1335
+ {
1336
+ "epoch": 0.5836277069574566,
1337
+ "grad_norm": 1.46076238155365,
1338
+ "learning_rate": 7.809442584977113e-06,
1339
+ "loss": 0.9823,
1340
+ "step": 1900
1341
+ },
1342
+ {
1343
+ "epoch": 0.5866994317309169,
1344
+ "grad_norm": 1.5136983394622803,
1345
+ "learning_rate": 7.712488273208183e-06,
1346
+ "loss": 1.0759,
1347
+ "step": 1910
1348
+ },
1349
+ {
1350
+ "epoch": 0.5897711565043772,
1351
+ "grad_norm": 1.2395247220993042,
1352
+ "learning_rate": 7.615760339972421e-06,
1353
+ "loss": 0.9722,
1354
+ "step": 1920
1355
+ },
1356
+ {
1357
+ "epoch": 0.5928428812778375,
1358
+ "grad_norm": 1.1349307298660278,
1359
+ "learning_rate": 7.519268357735574e-06,
1360
+ "loss": 0.9651,
1361
+ "step": 1930
1362
+ },
1363
+ {
1364
+ "epoch": 0.5959146060512978,
1365
+ "grad_norm": 1.512542963027954,
1366
+ "learning_rate": 7.423021875613009e-06,
1367
+ "loss": 0.9941,
1368
+ "step": 1940
1369
+ },
1370
+ {
1371
+ "epoch": 0.598986330824758,
1372
+ "grad_norm": 1.5082643032073975,
1373
+ "learning_rate": 7.32703041842473e-06,
1374
+ "loss": 1.0251,
1375
+ "step": 1950
1376
+ },
1377
+ {
1378
+ "epoch": 0.6020580555982183,
1379
+ "grad_norm": 1.27542245388031,
1380
+ "learning_rate": 7.231303485752756e-06,
1381
+ "loss": 1.0445,
1382
+ "step": 1960
1383
+ },
1384
+ {
1385
+ "epoch": 0.6051297803716787,
1386
+ "grad_norm": 1.3753620386123657,
1387
+ "learning_rate": 7.135850551001034e-06,
1388
+ "loss": 0.9565,
1389
+ "step": 1970
1390
+ },
1391
+ {
1392
+ "epoch": 0.608201505145139,
1393
+ "grad_norm": 1.503854513168335,
1394
+ "learning_rate": 7.040681060457895e-06,
1395
+ "loss": 1.0124,
1396
+ "step": 1980
1397
+ },
1398
+ {
1399
+ "epoch": 0.6112732299185993,
1400
+ "grad_norm": 1.3910346031188965,
1401
+ "learning_rate": 6.9458044323612575e-06,
1402
+ "loss": 1.0008,
1403
+ "step": 1990
1404
+ },
1405
+ {
1406
+ "epoch": 0.6143449546920596,
1407
+ "grad_norm": 1.2628332376480103,
1408
+ "learning_rate": 6.851230055966549e-06,
1409
+ "loss": 0.9856,
1410
+ "step": 2000
1411
+ },
1412
+ {
1413
+ "epoch": 0.6174166794655199,
1414
+ "grad_norm": 1.8083914518356323,
1415
+ "learning_rate": 6.756967290617533e-06,
1416
+ "loss": 0.9701,
1417
+ "step": 2010
1418
+ },
1419
+ {
1420
+ "epoch": 0.6204884042389802,
1421
+ "grad_norm": 1.4478784799575806,
1422
+ "learning_rate": 6.6630254648200656e-06,
1423
+ "loss": 0.9418,
1424
+ "step": 2020
1425
+ },
1426
+ {
1427
+ "epoch": 0.6235601290124405,
1428
+ "grad_norm": 2.209411859512329,
1429
+ "learning_rate": 6.569413875318937e-06,
1430
+ "loss": 0.9841,
1431
+ "step": 2030
1432
+ },
1433
+ {
1434
+ "epoch": 0.6266318537859008,
1435
+ "grad_norm": 1.5672208070755005,
1436
+ "learning_rate": 6.4761417861778366e-06,
1437
+ "loss": 0.9717,
1438
+ "step": 2040
1439
+ },
1440
+ {
1441
+ "epoch": 0.6297035785593611,
1442
+ "grad_norm": 1.6222248077392578,
1443
+ "learning_rate": 6.383218427862544e-06,
1444
+ "loss": 0.9686,
1445
+ "step": 2050
1446
+ },
1447
+ {
1448
+ "epoch": 0.6327753033328214,
1449
+ "grad_norm": 1.7131073474884033,
1450
+ "learning_rate": 6.290652996327471e-06,
1451
+ "loss": 1.0376,
1452
+ "step": 2060
1453
+ },
1454
+ {
1455
+ "epoch": 0.6358470281062817,
1456
+ "grad_norm": 1.3580713272094727,
1457
+ "learning_rate": 6.198454652105599e-06,
1458
+ "loss": 0.9885,
1459
+ "step": 2070
1460
+ },
1461
+ {
1462
+ "epoch": 0.638918752879742,
1463
+ "grad_norm": 1.482251524925232,
1464
+ "learning_rate": 6.106632519401924e-06,
1465
+ "loss": 1.0063,
1466
+ "step": 2080
1467
+ },
1468
+ {
1469
+ "epoch": 0.6419904776532023,
1470
+ "grad_norm": 1.4394768476486206,
1471
+ "learning_rate": 6.015195685190496e-06,
1472
+ "loss": 1.0049,
1473
+ "step": 2090
1474
+ },
1475
+ {
1476
+ "epoch": 0.6450622024266626,
1477
+ "grad_norm": 1.397321105003357,
1478
+ "learning_rate": 5.9241531983151604e-06,
1479
+ "loss": 1.061,
1480
+ "step": 2100
1481
+ },
1482
+ {
1483
+ "epoch": 0.6481339272001229,
1484
+ "grad_norm": 1.4152517318725586,
1485
+ "learning_rate": 5.833514068594053e-06,
1486
+ "loss": 1.0047,
1487
+ "step": 2110
1488
+ },
1489
+ {
1490
+ "epoch": 0.6512056519735832,
1491
+ "grad_norm": 1.4583706855773926,
1492
+ "learning_rate": 5.743287265927959e-06,
1493
+ "loss": 0.9749,
1494
+ "step": 2120
1495
+ },
1496
+ {
1497
+ "epoch": 0.6542773767470434,
1498
+ "grad_norm": 1.5237219333648682,
1499
+ "learning_rate": 5.65348171941263e-06,
1500
+ "loss": 1.0137,
1501
+ "step": 2130
1502
+ },
1503
+ {
1504
+ "epoch": 0.6573491015205037,
1505
+ "grad_norm": 1.4176579713821411,
1506
+ "learning_rate": 5.564106316455127e-06,
1507
+ "loss": 0.9807,
1508
+ "step": 2140
1509
+ },
1510
+ {
1511
+ "epoch": 0.660420826293964,
1512
+ "grad_norm": 1.3908677101135254,
1513
+ "learning_rate": 5.475169901894324e-06,
1514
+ "loss": 0.9511,
1515
+ "step": 2150
1516
+ },
1517
+ {
1518
+ "epoch": 0.6634925510674243,
1519
+ "grad_norm": 1.3458560705184937,
1520
+ "learning_rate": 5.386681277125565e-06,
1521
+ "loss": 1.0058,
1522
+ "step": 2160
1523
+ },
1524
+ {
1525
+ "epoch": 0.6665642758408846,
1526
+ "grad_norm": 1.272244930267334,
1527
+ "learning_rate": 5.298649199229671e-06,
1528
+ "loss": 0.9592,
1529
+ "step": 2170
1530
+ },
1531
+ {
1532
+ "epoch": 0.6696360006143449,
1533
+ "grad_norm": 1.8487739562988281,
1534
+ "learning_rate": 5.211082380106323e-06,
1535
+ "loss": 1.0428,
1536
+ "step": 2180
1537
+ },
1538
+ {
1539
+ "epoch": 0.6727077253878052,
1540
+ "grad_norm": 1.4018948078155518,
1541
+ "learning_rate": 5.123989485611881e-06,
1542
+ "loss": 1.0358,
1543
+ "step": 2190
1544
+ },
1545
+ {
1546
+ "epoch": 0.6757794501612655,
1547
+ "grad_norm": 2.0736992359161377,
1548
+ "learning_rate": 5.037379134701827e-06,
1549
+ "loss": 1.015,
1550
+ "step": 2200
1551
+ },
1552
+ {
1553
+ "epoch": 0.6788511749347258,
1554
+ "grad_norm": 1.3019697666168213,
1555
+ "learning_rate": 4.951259898577754e-06,
1556
+ "loss": 0.9984,
1557
+ "step": 2210
1558
+ },
1559
+ {
1560
+ "epoch": 0.6819228997081861,
1561
+ "grad_norm": 1.2767648696899414,
1562
+ "learning_rate": 4.865640299839193e-06,
1563
+ "loss": 1.0063,
1564
+ "step": 2220
1565
+ },
1566
+ {
1567
+ "epoch": 0.6849946244816465,
1568
+ "grad_norm": 1.8070132732391357,
1569
+ "learning_rate": 4.780528811640162e-06,
1570
+ "loss": 1.0371,
1571
+ "step": 2230
1572
+ },
1573
+ {
1574
+ "epoch": 0.6880663492551068,
1575
+ "grad_norm": 1.3197108507156372,
1576
+ "learning_rate": 4.69593385685064e-06,
1577
+ "loss": 1.0238,
1578
+ "step": 2240
1579
+ },
1580
+ {
1581
+ "epoch": 0.6911380740285671,
1582
+ "grad_norm": 1.4064984321594238,
1583
+ "learning_rate": 4.611863807223021e-06,
1584
+ "loss": 1.0283,
1585
+ "step": 2250
1586
+ },
1587
+ {
1588
+ "epoch": 0.6942097988020274,
1589
+ "grad_norm": 1.696387767791748,
1590
+ "learning_rate": 4.528326982563619e-06,
1591
+ "loss": 1.0002,
1592
+ "step": 2260
1593
+ },
1594
+ {
1595
+ "epoch": 0.6972815235754877,
1596
+ "grad_norm": 1.372271180152893,
1597
+ "learning_rate": 4.44533164990933e-06,
1598
+ "loss": 0.9299,
1599
+ "step": 2270
1600
+ },
1601
+ {
1602
+ "epoch": 0.700353248348948,
1603
+ "grad_norm": 1.4047715663909912,
1604
+ "learning_rate": 4.362886022709493e-06,
1605
+ "loss": 1.0112,
1606
+ "step": 2280
1607
+ },
1608
+ {
1609
+ "epoch": 0.7034249731224083,
1610
+ "grad_norm": 1.5955997705459595,
1611
+ "learning_rate": 4.280998260013043e-06,
1612
+ "loss": 0.9021,
1613
+ "step": 2290
1614
+ },
1615
+ {
1616
+ "epoch": 0.7064966978958686,
1617
+ "grad_norm": 1.2997745275497437,
1618
+ "learning_rate": 4.199676465661115e-06,
1619
+ "loss": 1.057,
1620
+ "step": 2300
1621
+ },
1622
+ {
1623
+ "epoch": 0.7095684226693288,
1624
+ "grad_norm": 1.4059574604034424,
1625
+ "learning_rate": 4.118928687485021e-06,
1626
+ "loss": 0.9787,
1627
+ "step": 2310
1628
+ },
1629
+ {
1630
+ "epoch": 0.7126401474427891,
1631
+ "grad_norm": 1.478273868560791,
1632
+ "learning_rate": 4.0387629165098485e-06,
1633
+ "loss": 0.9974,
1634
+ "step": 2320
1635
+ },
1636
+ {
1637
+ "epoch": 0.7157118722162494,
1638
+ "grad_norm": 1.3587843179702759,
1639
+ "learning_rate": 3.9591870861636214e-06,
1640
+ "loss": 1.0576,
1641
+ "step": 2330
1642
+ },
1643
+ {
1644
+ "epoch": 0.7187835969897097,
1645
+ "grad_norm": 1.487290620803833,
1646
+ "learning_rate": 3.880209071492195e-06,
1647
+ "loss": 0.999,
1648
+ "step": 2340
1649
+ },
1650
+ {
1651
+ "epoch": 0.72185532176317,
1652
+ "grad_norm": 1.4370648860931396,
1653
+ "learning_rate": 3.8018366883799263e-06,
1654
+ "loss": 1.0137,
1655
+ "step": 2350
1656
+ },
1657
+ {
1658
+ "epoch": 0.7249270465366303,
1659
+ "grad_norm": 1.2997723817825317,
1660
+ "learning_rate": 3.7240776927761825e-06,
1661
+ "loss": 0.9579,
1662
+ "step": 2360
1663
+ },
1664
+ {
1665
+ "epoch": 0.7279987713100906,
1666
+ "grad_norm": 1.533390760421753,
1667
+ "learning_rate": 3.6469397799277884e-06,
1668
+ "loss": 0.9849,
1669
+ "step": 2370
1670
+ },
1671
+ {
1672
+ "epoch": 0.7310704960835509,
1673
+ "grad_norm": 1.2424747943878174,
1674
+ "learning_rate": 3.5704305836175025e-06,
1675
+ "loss": 0.9922,
1676
+ "step": 2380
1677
+ },
1678
+ {
1679
+ "epoch": 0.7341422208570112,
1680
+ "grad_norm": 1.4729893207550049,
1681
+ "learning_rate": 3.4945576754085285e-06,
1682
+ "loss": 0.918,
1683
+ "step": 2390
1684
+ },
1685
+ {
1686
+ "epoch": 0.7372139456304715,
1687
+ "grad_norm": 1.9271929264068604,
1688
+ "learning_rate": 3.41932856389524e-06,
1689
+ "loss": 1.0052,
1690
+ "step": 2400
1691
+ },
1692
+ {
1693
+ "epoch": 0.7402856704039318,
1694
+ "grad_norm": 1.5550090074539185,
1695
+ "learning_rate": 3.344750693960088e-06,
1696
+ "loss": 0.9793,
1697
+ "step": 2410
1698
+ },
1699
+ {
1700
+ "epoch": 0.7433573951773921,
1701
+ "grad_norm": 1.2984689474105835,
1702
+ "learning_rate": 3.2708314460368417e-06,
1703
+ "loss": 1.0048,
1704
+ "step": 2420
1705
+ },
1706
+ {
1707
+ "epoch": 0.7464291199508524,
1708
+ "grad_norm": 1.3870617151260376,
1709
+ "learning_rate": 3.1975781353802095e-06,
1710
+ "loss": 0.9915,
1711
+ "step": 2430
1712
+ },
1713
+ {
1714
+ "epoch": 0.7495008447243127,
1715
+ "grad_norm": 1.4227845668792725,
1716
+ "learning_rate": 3.124998011341883e-06,
1717
+ "loss": 0.9949,
1718
+ "step": 2440
1719
+ },
1720
+ {
1721
+ "epoch": 0.752572569497773,
1722
+ "grad_norm": 1.5820941925048828,
1723
+ "learning_rate": 3.0530982566531374e-06,
1724
+ "loss": 0.9694,
1725
+ "step": 2450
1726
+ },
1727
+ {
1728
+ "epoch": 0.7556442942712333,
1729
+ "grad_norm": 2.8299295902252197,
1730
+ "learning_rate": 2.981885986713995e-06,
1731
+ "loss": 0.9964,
1732
+ "step": 2460
1733
+ },
1734
+ {
1735
+ "epoch": 0.7587160190446935,
1736
+ "grad_norm": 1.6240131855010986,
1737
+ "learning_rate": 2.911368248889078e-06,
1738
+ "loss": 0.9875,
1739
+ "step": 2470
1740
+ },
1741
+ {
1742
+ "epoch": 0.7617877438181538,
1743
+ "grad_norm": 1.4417290687561035,
1744
+ "learning_rate": 2.841552021810183e-06,
1745
+ "loss": 0.9615,
1746
+ "step": 2480
1747
+ },
1748
+ {
1749
+ "epoch": 0.7648594685916142,
1750
+ "grad_norm": 1.56830632686615,
1751
+ "learning_rate": 2.7724442146856266e-06,
1752
+ "loss": 0.9857,
1753
+ "step": 2490
1754
+ },
1755
+ {
1756
+ "epoch": 0.7679311933650745,
1757
+ "grad_norm": 1.7147358655929565,
1758
+ "learning_rate": 2.704051666616534e-06,
1759
+ "loss": 1.0245,
1760
+ "step": 2500
1761
+ },
1762
+ {
1763
+ "epoch": 0.7710029181385348,
1764
+ "grad_norm": 1.4760040044784546,
1765
+ "learning_rate": 2.6363811459199896e-06,
1766
+ "loss": 0.9871,
1767
+ "step": 2510
1768
+ },
1769
+ {
1770
+ "epoch": 0.7740746429119951,
1771
+ "grad_norm": 1.227241039276123,
1772
+ "learning_rate": 2.5694393494592475e-06,
1773
+ "loss": 0.9439,
1774
+ "step": 2520
1775
+ },
1776
+ {
1777
+ "epoch": 0.7771463676854554,
1778
+ "grad_norm": 1.5236525535583496,
1779
+ "learning_rate": 2.5032329019809733e-06,
1780
+ "loss": 0.9765,
1781
+ "step": 2530
1782
+ },
1783
+ {
1784
+ "epoch": 0.7802180924589157,
1785
+ "grad_norm": 1.4436039924621582,
1786
+ "learning_rate": 2.4377683554596465e-06,
1787
+ "loss": 1.0443,
1788
+ "step": 2540
1789
+ },
1790
+ {
1791
+ "epoch": 0.783289817232376,
1792
+ "grad_norm": 1.4698201417922974,
1793
+ "learning_rate": 2.3730521884491744e-06,
1794
+ "loss": 1.0435,
1795
+ "step": 2550
1796
+ },
1797
+ {
1798
+ "epoch": 0.7863615420058363,
1799
+ "grad_norm": 1.3048474788665771,
1800
+ "learning_rate": 2.3090908054417294e-06,
1801
+ "loss": 1.0267,
1802
+ "step": 2560
1803
+ },
1804
+ {
1805
+ "epoch": 0.7894332667792966,
1806
+ "grad_norm": 1.3633249998092651,
1807
+ "learning_rate": 2.24589053623396e-06,
1808
+ "loss": 0.9543,
1809
+ "step": 2570
1810
+ },
1811
+ {
1812
+ "epoch": 0.7925049915527569,
1813
+ "grad_norm": 1.4888864755630493,
1814
+ "learning_rate": 2.1834576353005786e-06,
1815
+ "loss": 0.9568,
1816
+ "step": 2580
1817
+ },
1818
+ {
1819
+ "epoch": 0.7955767163262172,
1820
+ "grad_norm": 1.5138416290283203,
1821
+ "learning_rate": 2.1217982811753855e-06,
1822
+ "loss": 1.006,
1823
+ "step": 2590
1824
+ },
1825
+ {
1826
+ "epoch": 0.7986484410996775,
1827
+ "grad_norm": 1.4148247241973877,
1828
+ "learning_rate": 2.0609185758398444e-06,
1829
+ "loss": 0.9452,
1830
+ "step": 2600
1831
+ },
1832
+ {
1833
+ "epoch": 0.8017201658731378,
1834
+ "grad_norm": 1.6830410957336426,
1835
+ "learning_rate": 2.0008245441191954e-06,
1836
+ "loss": 1.0418,
1837
+ "step": 2610
1838
+ },
1839
+ {
1840
+ "epoch": 0.8047918906465981,
1841
+ "grad_norm": 1.4986162185668945,
1842
+ "learning_rate": 1.9415221330862276e-06,
1843
+ "loss": 0.9715,
1844
+ "step": 2620
1845
+ },
1846
+ {
1847
+ "epoch": 0.8078636154200584,
1848
+ "grad_norm": 1.3727550506591797,
1849
+ "learning_rate": 1.8830172114727508e-06,
1850
+ "loss": 0.9621,
1851
+ "step": 2630
1852
+ },
1853
+ {
1854
+ "epoch": 0.8109353401935187,
1855
+ "grad_norm": 1.5304306745529175,
1856
+ "learning_rate": 1.8253155690887915e-06,
1857
+ "loss": 1.0468,
1858
+ "step": 2640
1859
+ },
1860
+ {
1861
+ "epoch": 0.8140070649669789,
1862
+ "grad_norm": 1.4285249710083008,
1863
+ "learning_rate": 1.768422916249626e-06,
1864
+ "loss": 1.0002,
1865
+ "step": 2650
1866
+ },
1867
+ {
1868
+ "epoch": 0.8170787897404392,
1869
+ "grad_norm": 1.4852147102355957,
1870
+ "learning_rate": 1.7123448832106793e-06,
1871
+ "loss": 0.9601,
1872
+ "step": 2660
1873
+ },
1874
+ {
1875
+ "epoch": 0.8201505145138995,
1876
+ "grad_norm": 1.228345274925232,
1877
+ "learning_rate": 1.6570870196103218e-06,
1878
+ "loss": 1.0149,
1879
+ "step": 2670
1880
+ },
1881
+ {
1882
+ "epoch": 0.8232222392873598,
1883
+ "grad_norm": 1.2908066511154175,
1884
+ "learning_rate": 1.6026547939206826e-06,
1885
+ "loss": 0.9846,
1886
+ "step": 2680
1887
+ },
1888
+ {
1889
+ "epoch": 0.8262939640608201,
1890
+ "grad_norm": 1.5527454614639282,
1891
+ "learning_rate": 1.5490535929064476e-06,
1892
+ "loss": 0.9648,
1893
+ "step": 2690
1894
+ },
1895
+ {
1896
+ "epoch": 0.8293656888342804,
1897
+ "grad_norm": 1.4668254852294922,
1898
+ "learning_rate": 1.4962887210917987e-06,
1899
+ "loss": 1.0775,
1900
+ "step": 2700
1901
+ },
1902
+ {
1903
+ "epoch": 0.8324374136077407,
1904
+ "grad_norm": 1.3337604999542236,
1905
+ "learning_rate": 1.444365400235448e-06,
1906
+ "loss": 0.9332,
1907
+ "step": 2710
1908
+ },
1909
+ {
1910
+ "epoch": 0.835509138381201,
1911
+ "grad_norm": 1.3232961893081665,
1912
+ "learning_rate": 1.3932887688138775e-06,
1913
+ "loss": 0.9896,
1914
+ "step": 2720
1915
+ },
1916
+ {
1917
+ "epoch": 0.8385808631546613,
1918
+ "grad_norm": 1.5601872205734253,
1919
+ "learning_rate": 1.3430638815128239e-06,
1920
+ "loss": 1.0039,
1921
+ "step": 2730
1922
+ },
1923
+ {
1924
+ "epoch": 0.8416525879281216,
1925
+ "grad_norm": 1.4922980070114136,
1926
+ "learning_rate": 1.2936957087270519e-06,
1927
+ "loss": 1.0156,
1928
+ "step": 2740
1929
+ },
1930
+ {
1931
+ "epoch": 0.844724312701582,
1932
+ "grad_norm": 1.5822229385375977,
1933
+ "learning_rate": 1.2451891360684764e-06,
1934
+ "loss": 0.9864,
1935
+ "step": 2750
1936
+ },
1937
+ {
1938
+ "epoch": 0.8477960374750423,
1939
+ "grad_norm": 1.2503588199615479,
1940
+ "learning_rate": 1.1975489638826609e-06,
1941
+ "loss": 0.9598,
1942
+ "step": 2760
1943
+ },
1944
+ {
1945
+ "epoch": 0.8508677622485026,
1946
+ "grad_norm": 1.364817500114441,
1947
+ "learning_rate": 1.1507799067737591e-06,
1948
+ "loss": 0.8933,
1949
+ "step": 2770
1950
+ },
1951
+ {
1952
+ "epoch": 0.8539394870219629,
1953
+ "grad_norm": 1.3507331609725952,
1954
+ "learning_rate": 1.1048865931379594e-06,
1955
+ "loss": 1.0543,
1956
+ "step": 2780
1957
+ },
1958
+ {
1959
+ "epoch": 0.8570112117954232,
1960
+ "grad_norm": 1.4149025678634644,
1961
+ "learning_rate": 1.059873564705427e-06,
1962
+ "loss": 0.9826,
1963
+ "step": 2790
1964
+ },
1965
+ {
1966
+ "epoch": 0.8600829365688835,
1967
+ "grad_norm": 1.4314866065979004,
1968
+ "learning_rate": 1.0157452760908604e-06,
1969
+ "loss": 1.0093,
1970
+ "step": 2800
1971
+ },
1972
+ {
1973
+ "epoch": 0.8631546613423438,
1974
+ "grad_norm": 1.4840718507766724,
1975
+ "learning_rate": 9.725060943526343e-07,
1976
+ "loss": 0.9949,
1977
+ "step": 2810
1978
+ },
1979
+ {
1980
+ "epoch": 0.866226386115804,
1981
+ "grad_norm": 1.3463382720947266,
1982
+ "learning_rate": 9.301602985606284e-07,
1983
+ "loss": 0.9574,
1984
+ "step": 2820
1985
+ },
1986
+ {
1987
+ "epoch": 0.8692981108892643,
1988
+ "grad_norm": 1.8905155658721924,
1989
+ "learning_rate": 8.887120793727677e-07,
1990
+ "loss": 1.0334,
1991
+ "step": 2830
1992
+ },
1993
+ {
1994
+ "epoch": 0.8723698356627246,
1995
+ "grad_norm": 1.4388149976730347,
1996
+ "learning_rate": 8.481655386202903e-07,
1997
+ "loss": 1.0218,
1998
+ "step": 2840
1999
+ },
2000
+ {
2001
+ "epoch": 0.8754415604361849,
2002
+ "grad_norm": 2.5872504711151123,
2003
+ "learning_rate": 8.08524688901825e-07,
2004
+ "loss": 1.0667,
2005
+ "step": 2850
2006
+ },
2007
+ {
2008
+ "epoch": 0.8785132852096452,
2009
+ "grad_norm": 1.5576486587524414,
2010
+ "learning_rate": 7.697934531862972e-07,
2011
+ "loss": 0.9662,
2012
+ "step": 2860
2013
+ },
2014
+ {
2015
+ "epoch": 0.8815850099831055,
2016
+ "grad_norm": 1.4606417417526245,
2017
+ "learning_rate": 7.319756644246878e-07,
2018
+ "loss": 0.9629,
2019
+ "step": 2870
2020
+ },
2021
+ {
2022
+ "epoch": 0.8846567347565658,
2023
+ "grad_norm": 1.4137325286865234,
2024
+ "learning_rate": 6.950750651707327e-07,
2025
+ "loss": 1.0736,
2026
+ "step": 2880
2027
+ },
2028
+ {
2029
+ "epoch": 0.8877284595300261,
2030
+ "grad_norm": 1.471315860748291,
2031
+ "learning_rate": 6.590953072105321e-07,
2032
+ "loss": 1.0666,
2033
+ "step": 2890
2034
+ },
2035
+ {
2036
+ "epoch": 0.8908001843034864,
2037
+ "grad_norm": 1.6415996551513672,
2038
+ "learning_rate": 6.240399512011664e-07,
2039
+ "loss": 1.0137,
2040
+ "step": 2900
2041
+ },
2042
+ {
2043
+ "epoch": 0.8938719090769467,
2044
+ "grad_norm": 1.6317228078842163,
2045
+ "learning_rate": 5.899124663183287e-07,
2046
+ "loss": 0.9415,
2047
+ "step": 2910
2048
+ },
2049
+ {
2050
+ "epoch": 0.896943633850407,
2051
+ "grad_norm": 1.4955772161483765,
2052
+ "learning_rate": 5.567162299129947e-07,
2053
+ "loss": 1.0241,
2054
+ "step": 2920
2055
+ },
2056
+ {
2057
+ "epoch": 0.9000153586238673,
2058
+ "grad_norm": 1.4452728033065796,
2059
+ "learning_rate": 5.244545271772016e-07,
2060
+ "loss": 0.9614,
2061
+ "step": 2930
2062
+ },
2063
+ {
2064
+ "epoch": 0.9030870833973276,
2065
+ "grad_norm": 1.4729427099227905,
2066
+ "learning_rate": 4.931305508189255e-07,
2067
+ "loss": 0.9674,
2068
+ "step": 2940
2069
+ },
2070
+ {
2071
+ "epoch": 0.9061588081707879,
2072
+ "grad_norm": 1.3615094423294067,
2073
+ "learning_rate": 4.6274740074613187e-07,
2074
+ "loss": 1.0474,
2075
+ "step": 2950
2076
+ },
2077
+ {
2078
+ "epoch": 0.9092305329442482,
2079
+ "grad_norm": 1.5536061525344849,
2080
+ "learning_rate": 4.33308083759999e-07,
2081
+ "loss": 0.9291,
2082
+ "step": 2960
2083
+ },
2084
+ {
2085
+ "epoch": 0.9123022577177085,
2086
+ "grad_norm": 1.582104206085205,
2087
+ "learning_rate": 4.0481551325734393e-07,
2088
+ "loss": 0.998,
2089
+ "step": 2970
2090
+ },
2091
+ {
2092
+ "epoch": 0.9153739824911687,
2093
+ "grad_norm": 1.5118730068206787,
2094
+ "learning_rate": 3.772725089423235e-07,
2095
+ "loss": 0.9718,
2096
+ "step": 2980
2097
+ },
2098
+ {
2099
+ "epoch": 0.918445707264629,
2100
+ "grad_norm": 1.7106198072433472,
2101
+ "learning_rate": 3.506817965473741e-07,
2102
+ "loss": 0.9625,
2103
+ "step": 2990
2104
+ },
2105
+ {
2106
+ "epoch": 0.9215174320380893,
2107
+ "grad_norm": 1.4147557020187378,
2108
+ "learning_rate": 3.2504600756347314e-07,
2109
+ "loss": 0.9654,
2110
+ "step": 3000
2111
+ },
2112
+ {
2113
+ "epoch": 0.9245891568115497,
2114
+ "grad_norm": 1.331018090248108,
2115
+ "learning_rate": 3.003676789797161e-07,
2116
+ "loss": 1.0091,
2117
+ "step": 3010
2118
+ },
2119
+ {
2120
+ "epoch": 0.92766088158501,
2121
+ "grad_norm": 1.653944969177246,
2122
+ "learning_rate": 2.7664925303224953e-07,
2123
+ "loss": 0.9399,
2124
+ "step": 3020
2125
+ },
2126
+ {
2127
+ "epoch": 0.9307326063584703,
2128
+ "grad_norm": 1.8000606298446655,
2129
+ "learning_rate": 2.5389307696258136e-07,
2130
+ "loss": 0.9671,
2131
+ "step": 3030
2132
+ },
2133
+ {
2134
+ "epoch": 0.9338043311319306,
2135
+ "grad_norm": 1.3829896450042725,
2136
+ "learning_rate": 2.321014027852908e-07,
2137
+ "loss": 1.0047,
2138
+ "step": 3040
2139
+ },
2140
+ {
2141
+ "epoch": 0.9368760559053909,
2142
+ "grad_norm": 1.5390887260437012,
2143
+ "learning_rate": 2.112763870651624e-07,
2144
+ "loss": 1.0249,
2145
+ "step": 3050
2146
+ },
2147
+ {
2148
+ "epoch": 0.9399477806788512,
2149
+ "grad_norm": 1.3523168563842773,
2150
+ "learning_rate": 1.9142009070377e-07,
2151
+ "loss": 0.922,
2152
+ "step": 3060
2153
+ },
2154
+ {
2155
+ "epoch": 0.9430195054523115,
2156
+ "grad_norm": 1.4008362293243408,
2157
+ "learning_rate": 1.7253447873551432e-07,
2158
+ "loss": 0.9748,
2159
+ "step": 3070
2160
+ },
2161
+ {
2162
+ "epoch": 0.9460912302257718,
2163
+ "grad_norm": 1.5816636085510254,
2164
+ "learning_rate": 1.5462142013317304e-07,
2165
+ "loss": 0.951,
2166
+ "step": 3080
2167
+ },
2168
+ {
2169
+ "epoch": 0.9491629549992321,
2170
+ "grad_norm": 1.5400006771087646,
2171
+ "learning_rate": 1.3768268762292537e-07,
2172
+ "loss": 0.9727,
2173
+ "step": 3090
2174
+ },
2175
+ {
2176
+ "epoch": 0.9522346797726924,
2177
+ "grad_norm": 1.4990243911743164,
2178
+ "learning_rate": 1.2171995750892896e-07,
2179
+ "loss": 1.0157,
2180
+ "step": 3100
2181
+ },
2182
+ {
2183
+ "epoch": 0.9553064045461527,
2184
+ "grad_norm": 1.3119021654129028,
2185
+ "learning_rate": 1.0673480950742831e-07,
2186
+ "loss": 1.0137,
2187
+ "step": 3110
2188
+ },
2189
+ {
2190
+ "epoch": 0.958378129319613,
2191
+ "grad_norm": 1.107062816619873,
2192
+ "learning_rate": 9.272872659041532e-08,
2193
+ "loss": 0.9122,
2194
+ "step": 3120
2195
+ },
2196
+ {
2197
+ "epoch": 0.9614498540930733,
2198
+ "grad_norm": 1.3335105180740356,
2199
+ "learning_rate": 7.970309483887329e-08,
2200
+ "loss": 0.9844,
2201
+ "step": 3130
2202
+ },
2203
+ {
2204
+ "epoch": 0.9645215788665336,
2205
+ "grad_norm": 1.968140721321106,
2206
+ "learning_rate": 6.765920330560894e-08,
2207
+ "loss": 1.0087,
2208
+ "step": 3140
2209
+ },
2210
+ {
2211
+ "epoch": 0.9675933036399939,
2212
+ "grad_norm": 1.6293946504592896,
2213
+ "learning_rate": 5.6598243887679984e-08,
2214
+ "loss": 0.9831,
2215
+ "step": 3150
2216
+ },
2217
+ {
2218
+ "epoch": 0.9706650284134541,
2219
+ "grad_norm": 1.341500163078308,
2220
+ "learning_rate": 4.652131120844727e-08,
2221
+ "loss": 1.016,
2222
+ "step": 3160
2223
+ },
2224
+ {
2225
+ "epoch": 0.9737367531869144,
2226
+ "grad_norm": 1.2992902994155884,
2227
+ "learning_rate": 3.74294025092381e-08,
2228
+ "loss": 0.9811,
2229
+ "step": 3170
2230
+ },
2231
+ {
2232
+ "epoch": 0.9768084779603747,
2233
+ "grad_norm": 1.4742531776428223,
2234
+ "learning_rate": 2.9323417550668475e-08,
2235
+ "loss": 0.9351,
2236
+ "step": 3180
2237
+ },
2238
+ {
2239
+ "epoch": 0.979880202733835,
2240
+ "grad_norm": 1.5771090984344482,
2241
+ "learning_rate": 2.2204158523592145e-08,
2242
+ "loss": 0.9744,
2243
+ "step": 3190
2244
+ },
2245
+ {
2246
+ "epoch": 0.9829519275072953,
2247
+ "grad_norm": 1.74678635597229,
2248
+ "learning_rate": 1.6072329969714085e-08,
2249
+ "loss": 0.9746,
2250
+ "step": 3200
2251
+ },
2252
+ {
2253
+ "epoch": 0.9860236522807556,
2254
+ "grad_norm": 1.7540942430496216,
2255
+ "learning_rate": 1.0928538711871828e-08,
2256
+ "loss": 0.9521,
2257
+ "step": 3210
2258
+ },
2259
+ {
2260
+ "epoch": 0.9890953770542159,
2261
+ "grad_norm": 1.5535780191421509,
2262
+ "learning_rate": 6.773293793976843e-09,
2263
+ "loss": 0.9847,
2264
+ "step": 3220
2265
+ },
2266
+ {
2267
+ "epoch": 0.9921671018276762,
2268
+ "grad_norm": 1.4685219526290894,
2269
+ "learning_rate": 3.607006430642601e-09,
2270
+ "loss": 0.9623,
2271
+ "step": 3230
2272
+ },
2273
+ {
2274
+ "epoch": 0.9952388266011365,
2275
+ "grad_norm": 1.4856065511703491,
2276
+ "learning_rate": 1.4299899664882432e-09,
2277
+ "loss": 0.9794,
2278
+ "step": 3240
2279
+ },
2280
+ {
2281
+ "epoch": 0.9983105513745968,
2282
+ "grad_norm": 1.4667905569076538,
2283
+ "learning_rate": 2.4245984512671905e-10,
2284
+ "loss": 0.9583,
2285
+ "step": 3250
2286
+ }
2287
+ ],
2288
+ "logging_steps": 10,
2289
+ "max_steps": 3256,
2290
+ "num_input_tokens_seen": 0,
2291
+ "num_train_epochs": 1,
2292
+ "save_steps": 200,
2293
+ "stateful_callbacks": {
2294
+ "TrainerControl": {
2295
+ "args": {
2296
+ "should_epoch_stop": false,
2297
+ "should_evaluate": false,
2298
+ "should_log": false,
2299
+ "should_save": true,
2300
+ "should_training_stop": true
2301
+ },
2302
+ "attributes": {}
2303
+ }
2304
+ },
2305
+ "total_flos": 2.8457671270829184e+17,
2306
+ "train_batch_size": 1,
2307
+ "trial_name": null,
2308
+ "trial_params": null
2309
+ }
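Note: the checkpoint-3256/trainer_state.json added above ends here; its log_history array records loss, grad_norm and learning_rate every 10 steps ("logging_steps": 10) up to "max_steps": 3256. A minimal sketch, assuming the checkpoint directory is available locally and matplotlib is installed (neither is guaranteed by this commit), for reading that log and plotting the training-loss curve:

import json
import matplotlib.pyplot as plt

# Path assumed from the checkpoint layout shown in this commit.
with open("checkpoint-3256/trainer_state.json") as f:
    state = json.load(f)

# Keep only the periodic training-loss entries from log_history.
entries = [e for e in state["log_history"] if "loss" in e]
steps = [e["step"] for e in entries]
losses = [e["loss"] for e in entries]

plt.plot(steps, losses)
plt.xlabel("step")
plt.ylabel("training loss")
plt.savefig("loss_curve.png")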
checkpoint-3256/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:427f9ef7775ae19943d0ad770e4668bee57f9626ec8f56e6b943d9567f7a0704
3
+ size 6097
special_tokens_map.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<|startoftext|>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "<|return|>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "<|reserved_200017|>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ }
23
+ }
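Note: the special_tokens_map.json above declares <|startoftext|> as BOS, <|return|> as EOS and <|reserved_200017|> as padding. A minimal sketch, assuming the repository files have been cloned to a local directory (the "." path below is hypothetical), for checking that the tokenizer picks these tokens up:

from transformers import AutoTokenizer

# "." stands for the root of this repository after cloning; adjust as needed.
tok = AutoTokenizer.from_pretrained(".")
print(tok.bos_token, tok.eos_token, tok.pad_token)
# Expected output: <|startoftext|> <|return|> <|reserved_200017|>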
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d54615cba5113384c7495974d13feabc433e3e27e9262ac6a1a77f762a48d1c8
3
+ size 27868273
tokenizer_config.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bd9f6c1cec89c00a50e0eea8e27f8feadc89a155d49368d7a76e7de1f462cff5
3
+ size 4229