aifeifei798 committed
Commit 6656e66 · verified · Parent: 2171048

Upload 17 files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
adapter_config.json ADDED
@@ -0,0 +1,41 @@
+ {
+   "alpha_pattern": {},
+   "auto_mapping": null,
+   "base_model_name_or_path": "./gemma-3-4b-it-qat-unsloth-bnb-4bit",
+   "bias": "none",
+   "corda_config": null,
+   "eva_config": null,
+   "exclude_modules": null,
+   "fan_in_fan_out": false,
+   "inference_mode": true,
+   "init_lora_weights": true,
+   "layer_replication": null,
+   "layers_pattern": null,
+   "layers_to_transform": null,
+   "loftq_config": {},
+   "lora_alpha": 16,
+   "lora_bias": false,
+   "lora_dropout": 0,
+   "megatron_config": null,
+   "megatron_core": "megatron.core",
+   "modules_to_save": null,
+   "peft_type": "LORA",
+   "qalora_group_size": 16,
+   "r": 16,
+   "rank_pattern": {},
+   "revision": null,
+   "target_modules": [
+     "o_proj",
+     "down_proj",
+     "q_proj",
+     "gate_proj",
+     "v_proj",
+     "k_proj",
+     "up_proj"
+   ],
+   "task_type": "CAUSAL_LM",
+   "trainable_token_indices": null,
+   "use_dora": false,
+   "use_qalora": false,
+   "use_rslora": false
+ }
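
This is a standard PEFT LoRA configuration: rank r = 16, lora_alpha = 16, adapters on every attention and MLP projection, saved in inference mode against the local 4-bit QAT base model named above. A minimal sketch of attaching the adapter with peft (the model class and loading options are assumptions, not part of this repo):

    from peft import PeftModel
    from transformers import AutoModelForCausalLM

    # Path taken from "base_model_name_or_path" in adapter_config.json.
    base = AutoModelForCausalLM.from_pretrained(
        "./gemma-3-4b-it-qat-unsloth-bnb-4bit",
        device_map="auto",  # assumption: automatic device placement
    )

    # "." = the directory holding adapter_config.json and
    # adapter_model.safetensors from this commit.
    model = PeftModel.from_pretrained(base, ".")
    model.eval()  # matches "inference_mode": true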
adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d4997c1f8b557d838958b7aef19c340b68397365744e832989fcc224d00bcc5a
+ size 131252288
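
The binary weights themselves live in Git LFS; the repository stores only the three-line pointer above (version, oid, size). A small sketch that reads such a pointer file, following the git-lfs v1 format shown:

    def parse_lfs_pointer(path: str) -> dict:
        """Split a git-lfs v1 pointer file into its key/value fields."""
        fields = {}
        with open(path) as f:
            for line in f:
                key, _, value = line.strip().partition(" ")
                fields[key] = value
        return fields

    ptr = parse_lfs_pointer("adapter_model.safetensors")
    print(ptr["oid"], int(ptr["size"]))  # sha256:d4997c1f..., 131252288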
added_tokens.json ADDED
@@ -0,0 +1,3 @@
+ {
+   "<image_soft_token>": 262144
+ }
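
The only vocabulary addition is the image placeholder token at id 262144. A quick consistency check (a sketch, assuming the tokenizer files from this commit sit in the current directory):

    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained(".")
    # Should print 262144, matching added_tokens.json.
    print(tok.convert_tokens_to_ids("<image_soft_token>"))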
chat_template.jinja ADDED
@@ -0,0 +1,47 @@
+ {{ bos_token }}
+ {%- if messages[0]['role'] == 'system' -%}
+     {%- if messages[0]['content'] is string -%}
+         {%- set first_user_prefix = messages[0]['content'] + '
+ 
+ ' -%}
+     {%- else -%}
+         {%- set first_user_prefix = messages[0]['content'][0]['text'] + '
+ 
+ ' -%}
+     {%- endif -%}
+     {%- set loop_messages = messages[1:] -%}
+ {%- else -%}
+     {%- set first_user_prefix = "" -%}
+     {%- set loop_messages = messages -%}
+ {%- endif -%}
+ {%- for message in loop_messages -%}
+     {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}
+         {{ raise_exception("Conversation roles must alternate user/assistant/user/assistant/...") }}
+     {%- endif -%}
+     {%- if (message['role'] == 'assistant') -%}
+         {%- set role = "model" -%}
+     {%- else -%}
+         {%- set role = message['role'] -%}
+     {%- endif -%}
+     {{ '<start_of_turn>' + role + '
+ ' + (first_user_prefix if loop.first else "") }}
+     {%- if message['content'] is string -%}
+         {{ message['content'] | trim }}
+     {%- elif message['content'] is iterable -%}
+         {%- for item in message['content'] -%}
+             {%- if item['type'] == 'image' -%}
+                 {{ '<start_of_image>' }}
+             {%- elif item['type'] == 'text' -%}
+                 {{ item['text'] | trim }}
+             {%- endif -%}
+         {%- endfor -%}
+     {%- else -%}
+         {{ raise_exception("Invalid content type") }}
+     {%- endif -%}
+     {{ '<end_of_turn>
+ ' }}
+ {%- endfor -%}
+ {%- if add_generation_prompt -%}
+     {{'<start_of_turn>model
+ '}}
+ {%- endif -%}
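
The template folds a leading system message into the first user turn, renames the assistant role to model, enforces strict user/model alternation, and wraps each turn in <start_of_turn>…<end_of_turn>. It is normally applied through the tokenizer rather than by hand; a sketch with illustrative messages:

    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained(".")
    messages = [
        {"role": "system", "content": "You are a concise assistant."},
        {"role": "user", "content": "Hello!"},
    ]
    prompt = tok.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    # Ends with "<start_of_turn>model\n", ready for generation.
    print(prompt)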
optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6e083f429974ebe562fcb34b65fb62acf98f6a8b4c3603dfd906b2105a44176a
+ size 62333579
preprocessor_config.json ADDED
@@ -0,0 +1,29 @@
+ {
+   "do_convert_rgb": null,
+   "do_normalize": true,
+   "do_pan_and_scan": null,
+   "do_rescale": true,
+   "do_resize": true,
+   "image_mean": [
+     0.5,
+     0.5,
+     0.5
+   ],
+   "image_processor_type": "Gemma3ImageProcessor",
+   "image_seq_length": 256,
+   "image_std": [
+     0.5,
+     0.5,
+     0.5
+   ],
+   "pan_and_scan_max_num_crops": null,
+   "pan_and_scan_min_crop_size": null,
+   "pan_and_scan_min_ratio_to_activate": null,
+   "processor_class": "Gemma3Processor",
+   "resample": 2,
+   "rescale_factor": 0.00392156862745098,
+   "size": {
+     "height": 896,
+     "width": 896
+   }
+ }
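
Per this config, images are resized to 896×896 (resample 2 = PIL bilinear), rescaled by 1/255 (0.00392156…), and normalized with mean/std 0.5, so pixel values land in roughly [-1, 1]; each image then occupies 256 soft tokens. A sketch of running just the image processor (the input file is illustrative):

    from PIL import Image
    from transformers import AutoImageProcessor

    # Resolves to Gemma3ImageProcessor via "image_processor_type".
    ip = AutoImageProcessor.from_pretrained(".")
    image = Image.open("sample.jpg")  # illustrative input
    out = ip(images=image, return_tensors="pt")
    # Expect shape (1, 3, 896, 896) and values in about [-1, 1].
    print(out["pixel_values"].shape)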
processor_config.json ADDED
@@ -0,0 +1,4 @@
+ {
+   "image_seq_length": 256,
+   "processor_class": "Gemma3Processor"
+ }
rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f1d565802a8e26c4e8a31328752b7a7fdc186d9401aa008e65697d0ad8c22e33
+ size 14645
scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6862e802a3e558b1042aa6b7ef87427ce4c4fa2eec06c7fbb6e6a22587b0b5e7
+ size 1465
special_tokens_map.json ADDED
@@ -0,0 +1,33 @@
+ {
+   "boi_token": "<start_of_image>",
+   "bos_token": {
+     "content": "<bos>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eoi_token": "<end_of_image>",
+   "eos_token": {
+     "content": "<end_of_turn>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "image_token": "<image_soft_token>",
+   "pad_token": {
+     "content": "<pad>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "<unk>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
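
Note that eos_token is mapped to <end_of_turn>, so generation stops at the end of a chat turn. A one-line check (a sketch):

    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained(".")
    print(tok.bos_token, tok.eos_token, tok.pad_token)  # <bos> <end_of_turn> <pad>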
tokenizer.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4667f2089529e8e7657cfb6d1c19910ae71ff5f28aa7ab2ff2763330affad795
+ size 33384568
tokenizer.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1299c11d7cf632ef3b4e11937501358ada021bbdf7c47638d13c0ee982f2e79c
+ size 4689074
tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff
 
trainer_state.json ADDED
@@ -0,0 +1,2421 @@
+ {
+   "best_global_step": null,
+   "best_metric": null,
+   "best_model_checkpoint": null,
+   "epoch": 0.4465031016252844,
+   "eval_steps": 500,
+   "global_step": 3410,
+   "is_hyper_param_search": false,
+   "is_local_process_zero": true,
+   "is_world_process_zero": true,
+   "log_history": [ ... ]
+ }

The log_history array records one entry every 10 steps, each with epoch, grad_norm, learning_rate, loss, and step. The rendered diff breaks off partway through the file (the last visible record is cut mid-entry); the records shown are tabulated below.

step | epoch | grad_norm | learning_rate | loss
10 | 0.0013093932598981946 | 0.9729704260826111 | 0.00018 | 2.9613
20 | 0.002618786519796389 | 0.4988560676574707 | 0.00019976402726796014 | 2.2501
30 | 0.003928179779694584 | 0.3629654049873352 | 0.0001995018353434714 | 1.9558
40 | 0.005237573039592778 | 0.42317306995391846 | 0.0001992396434189827 | 1.8904
50 | 0.006546966299490973 | 0.4342662990093231 | 0.00019897745149449398 | 1.9487
60 | 0.007856359559389167 | 0.4164058268070221 | 0.00019871525957000524 | 1.845
70 | 0.009165752819287363 | 0.38950663805007935 | 0.0001984530676455165 | 1.8264
80 | 0.010475146079185557 | 0.42093154788017273 | 0.00019819087572102778 | 1.8418
90 | 0.011784539339083753 | 0.4716477394104004 | 0.00019792868379653908 | 1.8346
100 | 0.013093932598981946 | 0.4358816146850586 | 0.00019766649187205035 | 1.8271
110 | 0.014403325858880142 | 0.45478910207748413 | 0.00019740429994756162 | 1.7506
120 | 0.015712719118778334 | 0.4366815388202667 | 0.00019714210802307289 | 1.7854
130 | 0.01702211237867653 | 0.45096880197525024 | 0.00019687991609858418 | 1.779
140 | 0.018331505638574726 | 0.4566694498062134 | 0.00019661772417409545 | 1.7509
150 | 0.01964089889847292 | 0.4729042649269104 | 0.00019635553224960672 | 1.7271
160 | 0.020950292158371114 | 0.46566858887672424 | 0.000196093340325118 | 1.714
170 | 0.02225968541826931 | 0.45467349886894226 | 0.00019583114840062926 | 1.702
180 | 0.023569078678167505 | 0.434721440076828 | 0.00019556895647614055 | 1.7162
190 | 0.024878471938065697 | 0.5182896852493286 | 0.00019530676455165182 | 1.688
200 | 0.026187865197963893 | 0.5060753226280212 | 0.0001950445726271631 | 1.6955
210 | 0.02749725845786209 | 0.46147406101226807 | 0.00019478238070267436 | 1.681
220 | 0.028806651717760284 | 0.4517662823200226 | 0.00019452018877818563 | 1.6936
230 | 0.030116044977658477 | 0.44920527935028076 | 0.00019425799685369693 | 1.6633
240 | 0.03142543823755667 | 0.5066579580307007 | 0.0001939958049292082 | 1.6872
250 | 0.03273483149745487 | 0.5238184928894043 | 0.00019373361300471946 | 1.6255
260 | 0.03404422475735306 | 0.4943958520889282 | 0.00019347142108023073 | 1.6499
270 | 0.03535361801725126 | 0.48346492648124695 | 0.00019320922915574203 | 1.672
280 | 0.03666301127714945 | 0.4401436746120453 | 0.0001929470372312533 | 1.6863
290 | 0.037972404537047644 | 0.4602312743663788 | 0.00019268484530676457 | 1.646
300 | 0.03928179779694584 | 0.4927528202533722 | 0.00019242265338227584 | 1.6252
310 | 0.040591191056844035 | 0.5075507760047913 | 0.0001921604614577871 | 1.6218
320 | 0.04190058431674223 | 0.5239428877830505 | 0.0001918982695332984 | 1.6354
330 | 0.043209977576640426 | 0.5954804420471191 | 0.00019163607760880967 | 1.7022
340 | 0.04451937083653862 | 0.5364096760749817 | 0.00019137388568432094 | 1.5981
350 | 0.04582876409643681 | 0.55096435546875 | 0.0001911116937598322 | 1.6211
360 | 0.04713815735633501 | 0.5193445682525635 | 0.00019084950183534348 | 1.6195
370 | 0.0484475506162332 | 0.528788685798645 | 0.00019058730991085477 | 1.6076
380 | 0.049756943876131395 | 0.5360815525054932 | 0.00019032511798636604 | 1.5912
390 | 0.051066337136029594 | 0.5031074285507202 | 0.0001900629260618773 | 1.6157
400 | 0.052375730395927786 | 0.5149925351142883 | 0.00018980073413738858 | 1.579
410 | 0.053685123655825985 | 0.5419250726699829 | 0.00018953854221289985 | 1.6242
420 | 0.05499451691572418 | 0.5513054728507996 | 0.00018927635028841112 | 1.5948
430 | 0.05630391017562237 | 0.5670781135559082 | 0.0001890141583639224 | 1.5314
440 | 0.05761330343552057 | 0.5327165722846985 | 0.00018875196643943366 | 1.5716
450 | 0.05892269669541876 | 0.5244112610816956 | 0.00018848977451494493 | 1.5347
460 | 0.06023208995531695 | 0.5349589586257935 | 0.00018822758259045622 | 1.564
470 | 0.06154148321521515 | 0.5296887755393982 | 0.0001879653906659675 | 1.5779
480 | 0.06285087647511334 | 0.5426337718963623 | 0.00018770319874147876 | 1.5112
490 | 0.06416026973501154 | 0.5532763004302979 | 0.00018744100681699003 | 1.5458
500 | 0.06546966299490974 | 0.5318668484687805 | 0.00018717881489250133 | 1.5597
510 | 0.06677905625480793 | 0.6084654331207275 | 0.0001869166229680126 | 1.5485
520 | 0.06808844951470612 | 0.5626131296157837 | 0.00018665443104352386 | 1.5217
530 | 0.06939784277460431 | 0.528758704662323 | 0.00018639223911903513 | 1.5343
540 | 0.07070723603450252 | 0.5894292593002319 | 0.0001861300471945464 | 1.5604
550 | 0.07201662929440071 | 0.5676683187484741 | 0.0001858678552700577 | 1.5216
560 | 0.0733260225542989 | 0.6381473541259766 | 0.00018560566334556897 | 1.4334
570 | 0.0746354158141971 | 0.6644160151481628 | 0.00018534347142108024 | 1.4832
580 | 0.07594480907409529 | 0.5856960415840149 | 0.0001850812794965915 | 1.5118
590 | 0.07725420233399348 | 0.5892801880836487 | 0.00018481908757210277 | 1.5028
600 | 0.07856359559389169 | 0.5674527883529663 | 0.00018455689564761407 | 1.5125
610 | 0.07987298885378988 | 0.6059868335723877 | 0.00018429470372312534 | 1.4543
620 | 0.08118238211368807 | 0.6255605816841125 | 0.0001840325117986366 | 1.4851
630 | 0.08249177537358626 | 0.5904423594474792 | 0.00018377031987414788 | 1.4154
640 | 0.08380116863348445 | 0.6035749912261963 | 0.00018350812794965917 | 1.4276
650 | 0.08511056189338265 | 0.597172737121582 | 0.00018324593602517044 | 1.4736
660 | 0.08641995515328085 | 0.6352164149284363 | 0.0001829837441006817 | 1.4975
670 | 0.08772934841317905 | 0.5500873327255249 | 0.00018272155217619298 | 1.4578
680 | 0.08903874167307724 | 0.6423613429069519 | 0.00018245936025170425 | 1.3926
690 | 0.09034813493297543 | 0.665908694267273 | 0.00018219716832721555 | 1.4548
700 | 0.09165752819287362 | 0.6354024410247803 | 0.00018193497640272682 | 1.5
710 | 0.09296692145277183 | 0.6588740348815918 | 0.00018167278447823808 | 1.3609
720 | 0.09427631471267002 | 0.6754702925682068 | 0.00018141059255374935 | 1.3432
730 | 0.09558570797256821 | 0.6337271332740784 | 0.00018114840062926062 | 1.4439
740 | 0.0968951012324664 | 0.6592088937759399 | 0.00018088620870477192 | 1.3949
750 | 0.0982044944923646 | 0.6700498461723328 | 0.0001806240167802832 | 1.4046
760 | 0.09951388775226279 | 0.708410382270813 | 0.00018036182485579446 | 1.3021
770 | 0.100823281012161 | 0.6718457937240601 | 0.00018009963293130573 | 1.3769
780 | 0.10213267427205919 | 0.661522388458252 | 0.00017983744100681702 | 1.434
790 | 0.10344206753195738 | 0.6615481376647949 | 0.0001795752490823283 | 1.3839
800 | 0.10475146079185557 | 0.696959376335144 | 0.00017931305715783956 | 1.3634
810 | 0.10606085405175376 | 0.7320592403411865 | 0.00017905086523335083 | 1.2737
820 | 0.10737024731165197 | 0.7200619578361511 | 0.0001787886733088621 | 1.3732
830 | 0.10867964057155016 | 0.6982961297035217 | 0.00017852648138437337 | 1.3019
840 | 0.10998903383144835 | 0.7427386045455933 | 0.00017826428945988464 | 1.3398
850 | 0.11129842709134655 | 0.7897806763648987 | 0.0001780020975353959 | 1.3216
860 | 0.11260782035124474 | 0.7520805597305298 | 0.00017773990561090717 | 1.2875
870 | 0.11391721361114293 | 0.7332555055618286 | 0.00017747771368641844 | 1.272
880 | 0.11522660687104114 | 0.7135840654373169 | 0.00017721552176192974 | 1.3185
890 | 0.11653600013093933 | 0.6898264288902283 | 0.000176953329837441 | 1.3089
900 | 0.11784539339083752 | 0.9488328099250793 | 0.00017669113791295228 | 1.2258
910 | 0.11915478665073571 | 0.7257933616638184 | 0.00017642894598846355 | 1.3284
920 | 0.1204641799106339 | 0.7688736915588379 | 0.00017616675406397484 | 1.2878
930 | 0.1217735731705321 | 0.8328510522842407 | 0.0001759045621394861 | 1.2346
940 | 0.1230829664304303 | 0.8448120951652527 | 0.00017564237021499738 | 1.2926
950 | 0.1243923596903285 | 0.8510689735412598 | 0.00017538017829050865 | 1.2109
960 | 0.12570175295022668 | 0.866874098777771 | 0.00017511798636601992 | 1.3091
970 | 0.12701114621012488 | 0.9010233879089355 | 0.00017485579444153122 | 1.2273
980 | 0.1283205394700231 | 0.9316047430038452 | 0.00017459360251704248 | 1.2611
990 | 0.12962993272992127 | 0.9005467295646667 | 0.00017433141059255375 | 1.1747
1000 | 0.13093932598981947 | 0.8843415975570679 | 0.00017406921866806502 | 1.1915
1010 | 0.13224871924971765 | 0.8090497851371765 | 0.0001738070267435763 | 1.2452
1020 | 0.13355811250961586 | 1.2498819828033447 | 0.0001735448348190876 | 1.276
1030 | 0.13486750576951406 | 0.7861034870147705 | 0.00017328264289459886 | 1.1989
1040 | 0.13617689902941224 | 0.9525002837181091 | 0.00017302045097011013 | 1.1338
1050 | 0.13748629228931045 | 0.8066142201423645 | 0.0001727582590456214 | 1.1421
1060 | 0.13879568554920862 | 0.8200965523719788 | 0.0001724960671211327 | 1.1596
1070 | 0.14010507880910683 | 0.9981400370597839 | 0.00017223387519664396 | 1.0562
1080 | 0.14141447206900504 | 0.9273063540458679 | 0.00017197168327215523 | 1.1275
1090 | 0.14272386532890322 | 0.8812237977981567 | 0.0001717094913476665 | 1.0406
1100 | 0.14403325858880142 | 0.8970304727554321 | 0.00017144729942317777 | 1.1263
1110 | 0.1453426518486996 | 0.9097404479980469 | 0.00017118510749868906 | 1.1956
1120 | 0.1466520451085978 | 1.0246269702911377 | 0.00017092291557420033 | 1.0717
1130 | 0.14796143836849598 | 1.1149781942367554 | 0.0001706607236497116 | 1.076
1140 | 0.1492708316283942 | 1.1981500387191772 | 0.00017039853172522287 | 1.142
1150 | 0.1505802248882924 | 0.9477318525314331 | 0.00017013633980073414 | 1.0799
1160 | 0.15188961814819057 | 1.0102957487106323 | 0.00016987414787624544 | 1.0531
1170 | 0.15319901140808878 | 1.1728227138519287 | 0.0001696119559517567 | 1.0903
1180 | 0.15450840466798696 | 1.0086623430252075 | 0.00016934976402726797 | 1.0677
1190 | 0.15581779792788517 | 0.8586070537567139 | 0.00016908757210277924 | 1.1022
1200 | 0.15712719118778337 | 1.2628968954086304 | 0.00016882538017829054 | 1.0575
1210 | 0.15843658444768155 | 0.9629563689231873 | 0.0001685631882538018 | 1.0844
1220 | 0.15974597770757976 | 1.0898447036743164 | 0.00016830099632931308 | 1.0654
1230 | 0.16105537096747793 | 1.13120698928833 | 0.00016803880440482435 | 1.0686
1240 | 0.16236476422737614 | 1.0732567310333252 | 0.00016777661248033561 | 1.084
1250 | 0.16367415748727435 | 1.0681878328323364 | 0.00016751442055584688 | 0.9979
1260 | 0.16498355074717252 | 0.9773361086845398 | 0.00016725222863135815 | 1.0841
1270 | 0.16629294400707073 | 1.0342450141906738 | 0.00016699003670686942 | 1.0176
1280 | 0.1676023372669689 | 1.0580531358718872 | 0.0001667278447823807 | 0.9858
1290 | 0.16891173052686712 | 0.9744387865066528 | 0.000166465652857892 | 0.9282
1300 | 0.1702211237867653 | 0.9636452198028564 | 0.00016620346093340326 | 0.9414
1310 | 0.1715305170466635 | 1.1029468774795532 | 0.00016594126900891453 | 0.8812
1320 | 0.1728399103065617 | 1.2941449880599976 | 0.0001656790770844258 | 0.9823
1330 | 0.17414930356645988 | 1.627166509628296 | 0.00016541688515993706 | 0.9585
1340 | 0.1754586968263581 | 1.091630458831787 | 0.00016515469323544836 | 0.9516
1350 | 0.17676809008625627 | 1.1108227968215942 | 0.00016489250131095963 | 0.8998
1360 | 0.17807748334615447 | 1.0883326530456543 | 0.0001646303093864709 | 0.916
1370 | 0.17938687660605268 | 1.2917275428771973 | 0.00016436811746198217 | 0.9112
1380 | 0.18069626986595086 | 1.1828432083129883 | 0.00016410592553749344 | 0.9721
1390 | 0.18200566312584907 | 1.3447389602661133 | 0.00016384373361300473 | 0.9198
1400 | 0.18331505638574724 | 1.0735760927200317 | 0.000163581541688516 | 0.8634
1410 | 0.18462444964564545 | 1.0454446077346802 | 0.00016331934976402727 | 0.9151
1420 | 0.18593384290554366 | 1.2230719327926636 | 0.00016305715783953854 | 0.9202
1430 | 0.18724323616544183 | 1.1030149459838867 | 0.00016279496591504984 | 0.9068
1440 | 0.18855262942534004 | 1.4471871852874756 | 0.0001625327739905611 | 0.8682
1450 | 0.18986202268523822 | 1.2458796501159668 | 0.00016227058206607237 | 0.8247
1460 | 0.19117141594513642 | 1.1849644184112549 | 0.00016200839014158364 | 0.8987
1470 | 0.19248080920503463 | 1.2985557317733765 | 0.0001617461982170949 | 0.8006
1480 | 0.1937902024649328 | 1.7127928733825684 | 0.0001614840062926062 | 0.8191
1490 | 0.19509959572483102 | 1.440895915031433 | 0.00016122181436811748 | 0.8129
1500 | 0.1964089889847292 | 1.252194881439209 | 0.00016095962244362875 | 0.8803
1510 | 0.1977183822446274 | 1.138358235359192 | 0.00016069743051914001 | 0.8744
1520 | 0.19902777550452558 | 1.080971598625183 | 0.00016043523859465128 | 0.8693
1530 | 0.20033716876442378 | 1.1612547636032104 | 0.00016017304667016258 | 0.7991
1540 | 0.201646562024322 | 1.1773971319198608 | 0.00015991085474567385 | 0.912
1550 | 0.20295595528422017 | 1.1353998184204102 | 0.00015964866282118512 | 0.7986
1560 | 0.20426534854411837 | 1.6848335266113281 | 0.0001593864708966964 | 0.6932
1570 | 0.20557474180401655 | 1.4043173789978027 | 0.00015912427897220768 | 0.8529
1580 | 0.20688413506391476 | 1.2601439952850342 | 0.00015886208704771895 | 0.8173
1590 | 0.20819352832381297 | 1.2090034484863281 | 0.00015859989512323022 | 0.7451
1600 | 0.20950292158371114 | 1.3334815502166748 | 0.0001583377031987415 | 0.775
1610 | 0.21081231484360935 | 1.1993087530136108 | 0.00015807551127425276 | 0.7733
1620 | 0.21212170810350753 | 1.51642906665802 | 0.00015781331934976406 | 0.6907
1630 | 0.21343110136340573 | 1.3714466094970703 | 0.00015755112742527532 | 0.7016
1640 | 0.21474049462330394 | 1.2519642114639282 | 0.0001572889355007866 | 0.7648
1650 | 0.21604988788320212 | 1.3851202726364136 | 0.00015702674357629786 | 0.7069
1660 | 0.21735928114310032 | 1.334105134010315 | 0.00015676455165180913 | 0.7338
1670 | 0.2186686744029985 | 1.3785145282745361 | 0.0001565023597273204 | 0.6299
1680 | 0.2199780676628967 | 1.4771215915679932 | 0.00015624016780283167 | 0.6828
1690 | 0.2212874609227949 | 1.3885449171066284 | 0.00015597797587834294 | 0.7141
1700 | 0.2225968541826931 | 1.2664909362792969 | 0.00015571578395385423 | 0.7667
1710 | 0.2239062474425913 | 1.2576826810836792 | 0.0001554535920293655 | 0.7395
1720 | 0.22521564070248948 | 1.284826636314392 | 0.00015519140010487677 | 0.6832
1730 | 0.22652503396238768 | 1.272933006286621 | 0.00015492920818038804 | 0.6892
1740 | 0.22783442722228586 | 1.3465379476547241 | 0.0001546670162558993 | 0.6449
1750 | 0.22914382048218407 | 1.2862318754196167 | 0.00015440482433141058 | 0.6883
1760 | 0.23045321374208227 | 1.2469042539596558 | 0.00015414263240692188 | 0.7593
1770 | 0.23176260700198045 | 1.5080034732818604 | 0.00015388044048243315 | 0.7009
1780 | 0.23307200026187866 | 0.9788569211959839 | 0.00015361824855794441 | 0.602
1790 | 0.23438139352177684 | 1.3450673818588257 | 0.00015335605663345568 | 0.6238
1800 | 0.23569078678167504 | 1.4177800416946411 | 0.00015309386470896695 | 0.6768
1810 | 0.23700018004157325 | 1.3528062105178833 | 0.00015283167278447825 | 0.6404
1820 | 0.23830957330147143 | 1.2898012399673462 | 0.00015256948085998952 | 0.6606
1830 | 0.23961896656136963 | 1.311298131942749 | 0.0001523072889355008 | 0.662
1840 | 0.2409283598212678 | 1.6476584672927856 | 0.00015204509701101206 | 0.671
1850 | 0.24223775308116602 | 1.36719810962677 | 0.00015178290508652335 | 0.7097
1860 | 0.2435471463410642 | 1.3647184371948242 | 0.00015152071316203462 | 0.6604
1870 | 0.2448565396009624 | 1.2265934944152832 | 0.0001512585212375459 | 0.6272
1880 | 0.2461659328608606 | 1.4882850646972656 | 0.00015099632931305716 | 0.7007
1890 | 0.2474753261207588 | 1.408470869064331 | 0.00015073413738856843 | 0.6526
1900 | 0.248784719380657 | 1.3388913869857788 | 0.00015047194546407972 | 0.6891
1910 | 0.2500941126405552 | 1.3725926876068115 | 0.000150209753539591 | 0.5763
1920 | 0.25140350590045335 | 1.40208899974823 | 0.00014994756161510226 | 0.5637
1930 | 0.25271289916035156 | 1.8308840990066528 | 0.00014968536969061353 | 0.6899
1940 | 0.25402229242024976 | 1.4921183586120605 | 0.0001494231777661248 | 0.5764
1950 | 0.25533168568014797 | 1.5387523174285889 | 0.0001491609858416361 | 0.5229
1960 | 0.2566410789400462 | 1.3345798254013062 | 0.00014889879391714737 | 0.5949
1970 | 0.2579504721999443 | 1.682065486907959 | 0.00014863660199265863 | 0.5619
1980 | 0.25925986545984253 | 1.480276346206665 | 0.0001483744100681699 | 0.5473
1990 | 0.26056925871974074 | 1.3453810214996338 | 0.0001481122181436812 | 0.5603
2000 | 0.26187865197963894 | 1.4118777513504028 | 0.00014785002621919247 | 0.5543
2010 | 0.26318804523953715 | 1.2959351539611816 | 0.00014758783429470374 | 0.4962
2020 | 0.2644974384994353 | 1.3605815172195435 | 0.000147325642370215 | 0.5699
2030 | 0.2658068317593335 | 2.086613416671753 | 0.00014706345044572628 | 0.565
2040 | 0.2671162250192317 | 1.2892887592315674 | 0.00014680125852123757 | 0.6062
2050 | 0.2684256182791299 | 1.5760036706924438 | 0.00014653906659674884 | 0.5642
2060 | 0.2697350115390281 | 1.21380615234375 | 0.0001462768746722601 | 0.5514
2070 | 0.2710444047989263 | 1.4393121004104614 | 0.00014601468274777138 | 0.5572
2080 | 0.2723537980588245 | 1.2972021102905273 | 0.00014575249082328265 | 0.535
2090 | 0.2736631913187227 | 1.0208637714385986 | 0.00014549029889879392 | 0.5835
2100 | 0.2749725845786209 | 1.4418736696243286 | 0.00014522810697430521 | 0.4829
2110 | 0.2762819778385191 | 1.4326051473617554 | 0.00014496591504981648 | 0.4711
2120 | 0.27759137109841725 | 1.497841715812683 | 0.00014470372312532775 | 0.4935
2130 | 0.27890076435831546 | 1.5082463026046753 | 0.00014444153120083902 | 0.4979
2140 | 0.28021015761821366 | 1.2458934783935547 | 0.0001441793392763503 | 0.5644
2150 | 0.28151955087811187 | 1.730130910873413 | 0.00014391714735186156 | 0.4749
2160 | 0.2828289441380101 | 1.2587112188339233 | 0.00014365495542737283 | 0.5175
2170 | 0.2841383373979082 | 1.431119441986084 | 0.0001433927635028841 | 0.5597
2180 | 0.28544773065780643 | 1.5383937358856201 | 0.0001431305715783954 | 0.5153
2190 | 0.28675712391770464 | 1.4311727285385132 | 0.00014286837965390666 | 0.5452
2200 | 0.28806651717760284 | 1.2555975914001465 | 0.00014260618772941793 | 0.4937
2210 | 0.28937591043750105 | 1.3781330585479736 | 0.0001423439958049292 | 0.4537
2220 | 0.2906853036973992 | 1.4810888767242432 | 0.00014208180388044047 | 0.396
2230 | 0.2919946969572974 | 1.6619911193847656 | 0.00014181961195595177 | 0.4756
2240 | 0.2933040902171956 | 1.3403065204620361 | 0.00014155742003146303 | 0.5157
2250 | 0.2946134834770938 | 1.4188278913497925 | 0.0001412952281069743 | 0.5237
2260 | 0.29592287673699197 | 1.852266550064087 | 0.00014103303618248557 | 0.4558
2270 | 0.2972322699968902 | 1.3092072010040283 | 0.00014077084425799687 | 0.4437
2280 | 0.2985416632567884 | 1.4190593957901 | 0.00014050865233350814 | 0.4717
2290 | 0.2998510565166866 | 1.4562608003616333 | 0.0001402464604090194 | 0.4744
2300 | 0.3011604497765848 | 1.4576420783996582 | 0.00013998426848453068 | 0.4429
2310 | 0.30246984303648294 | 1.867145299911499 | 0.00013972207656004194 | 0.4881
2320 | 0.30377923629638115 | 1.3077807426452637 | 0.00013945988463555324 | 0.4067
2330 | 0.30508862955627936 | 1.3587473630905151 | 0.0001391976927110645 | 0.4428
2340 | 0.30639802281617756 | 1.6012579202651978 | 0.00013893550078657578 | 0.4572
2350 | 0.30770741607607577 | 1.2226955890655518 | 0.00013867330886208705 | 0.4117
2360 | 0.3090168093359739 | 1.4615281820297241 | 0.00013841111693759834 | 0.4561
2370 | 0.3103262025958721 | 1.401014804840088 | 0.0001381489250131096 | 0.441
2380 | 0.31163559585577033 | 1.4875798225402832 | 0.00013788673308862088 | 0.3991
2390 | 0.31294498911566854 | 1.1867239475250244 | 0.00013762454116413215 | 0.4223
2400 | 0.31425438237556674 | 1.3172953128814697 | 0.00013736234923964342 | 0.4388
2410 | 0.3155637756354649 | 1.4044665098190308 | 0.00013710015731515472 | 0.4102
2420 | 0.3168731688953631 | 1.5709283351898193 | 0.00013683796539066599 | 0.4837
2430 | 0.3181825621552613 | 1.2237786054611206 | 0.00013657577346617725 | 0.4452
2440 | 0.3194919554151595 | 1.8869267702102661 | 0.00013631358154168852 | 0.4077
2450 | 0.3208013486750577 | 1.226117491722107 | 0.0001360513896171998 | 0.4109
2460 | 0.32211074193495587 | 1.6273385286331177 | 0.0001357891976927111 | 0.3596
2470 | 0.3234201351948541 | 1.4535574913024902 | 0.00013552700576822236 | 0.3996
2480 | 0.3247295284547523 | 1.6052360534667969 | 0.00013526481384373363 | 0.4082
2490 | 0.3260389217146505 | 1.9104530811309814 | 0.0001350026219192449 | 0.4089
2500 | 0.3273483149745487 | 1.6006613969802856 | 0.0001347404299947562 | 0.3848
2510 | 0.32865770823444684 | 1.4406352043151855 | 0.00013447823807026746 | 0.3926
2520 | 0.32996710149434505 | 1.3455756902694702 | 0.00013421604614577873 | 0.4203
2530 | 0.33127649475424326 | 1.7718679904937744 | 0.00013395385422129 | 0.3765
2540 | 0.33258588801414146 | 1.410130500793457 | 0.00013369166229680127 | 0.3646
2550 | 0.33389528127403967 | 1.6361408233642578 | 0.00013342947037231254 | 0.3917
2560 | 0.3352046745339378 | 1.7627660036087036 | 0.0001331672784478238 | 0.367
2570 | 0.336514067793836 | 1.2431906461715698 | 0.00013290508652333508 | 0.3708
2580 | 0.33782346105373423 | 1.4763669967651367 | 0.00013264289459884634 | 0.377
2590 | 0.33913285431363244 | 2.1701712608337402 | 0.00013238070267435761 | 0.344
2600 | 0.3404422475735306 | 1.4388126134872437 | 0.0001321185107498689 | 0.3556
2610 | 0.3417516408334288 | 1.2981114387512207 | 0.00013185631882538018 | 0.3272
2620 | 0.343061034093327 | 1.539335012435913 | 0.00013159412690089145 | 0.4132
2630 | 0.3443704273532252 | 1.9272770881652832 | 0.00013133193497640272 | 0.4121
2640 | 0.3456798206131234 | 1.4415314197540283 | 0.000131069743051914 | 0.3595
2650 | 0.34698921387302156 | 1.3155860900878906 | 0.00013080755112742528 | 0.3611
2660 | 0.34829860713291977 | 1.507858157157898 | 0.00013054535920293655 | 0.3813
2670 | 0.349608000392818 | 1.5444693565368652 | 0.00013028316727844782 | 0.3527
2680 | 0.3509173936527162 | 1.4008456468582153 | 0.0001300209753539591 | 0.3573

(The next record is cut off in the rendered diff: epoch 0.3522267869126144, grad_norm 1.6443661451339722, learning_rate 0.00012975878342947039, with its loss and step not shown.)
1392
+ "epoch": 0.25925986545984253,
1393
+ "grad_norm": 1.480276346206665,
1394
+ "learning_rate": 0.0001483744100681699,
1395
+ "loss": 0.5473,
1396
+ "step": 1980
1397
+ },
1398
+ {
1399
+ "epoch": 0.26056925871974074,
1400
+ "grad_norm": 1.3453810214996338,
1401
+ "learning_rate": 0.0001481122181436812,
1402
+ "loss": 0.5603,
1403
+ "step": 1990
1404
+ },
1405
+ {
1406
+ "epoch": 0.26187865197963894,
1407
+ "grad_norm": 1.4118777513504028,
1408
+ "learning_rate": 0.00014785002621919247,
1409
+ "loss": 0.5543,
1410
+ "step": 2000
1411
+ },
1412
+ {
1413
+ "epoch": 0.26318804523953715,
1414
+ "grad_norm": 1.2959351539611816,
1415
+ "learning_rate": 0.00014758783429470374,
1416
+ "loss": 0.4962,
1417
+ "step": 2010
1418
+ },
1419
+ {
1420
+ "epoch": 0.2644974384994353,
1421
+ "grad_norm": 1.3605815172195435,
1422
+ "learning_rate": 0.000147325642370215,
1423
+ "loss": 0.5699,
1424
+ "step": 2020
1425
+ },
1426
+ {
1427
+ "epoch": 0.2658068317593335,
1428
+ "grad_norm": 2.086613416671753,
1429
+ "learning_rate": 0.00014706345044572628,
1430
+ "loss": 0.565,
1431
+ "step": 2030
1432
+ },
1433
+ {
1434
+ "epoch": 0.2671162250192317,
1435
+ "grad_norm": 1.2892887592315674,
1436
+ "learning_rate": 0.00014680125852123757,
1437
+ "loss": 0.6062,
1438
+ "step": 2040
1439
+ },
1440
+ {
1441
+ "epoch": 0.2684256182791299,
1442
+ "grad_norm": 1.5760036706924438,
1443
+ "learning_rate": 0.00014653906659674884,
1444
+ "loss": 0.5642,
1445
+ "step": 2050
1446
+ },
1447
+ {
1448
+ "epoch": 0.2697350115390281,
1449
+ "grad_norm": 1.21380615234375,
1450
+ "learning_rate": 0.0001462768746722601,
1451
+ "loss": 0.5514,
1452
+ "step": 2060
1453
+ },
1454
+ {
1455
+ "epoch": 0.2710444047989263,
1456
+ "grad_norm": 1.4393121004104614,
1457
+ "learning_rate": 0.00014601468274777138,
1458
+ "loss": 0.5572,
1459
+ "step": 2070
1460
+ },
1461
+ {
1462
+ "epoch": 0.2723537980588245,
1463
+ "grad_norm": 1.2972021102905273,
1464
+ "learning_rate": 0.00014575249082328265,
1465
+ "loss": 0.535,
1466
+ "step": 2080
1467
+ },
1468
+ {
1469
+ "epoch": 0.2736631913187227,
1470
+ "grad_norm": 1.0208637714385986,
1471
+ "learning_rate": 0.00014549029889879392,
1472
+ "loss": 0.5835,
1473
+ "step": 2090
1474
+ },
1475
+ {
1476
+ "epoch": 0.2749725845786209,
1477
+ "grad_norm": 1.4418736696243286,
1478
+ "learning_rate": 0.00014522810697430521,
1479
+ "loss": 0.4829,
1480
+ "step": 2100
1481
+ },
1482
+ {
1483
+ "epoch": 0.2762819778385191,
1484
+ "grad_norm": 1.4326051473617554,
1485
+ "learning_rate": 0.00014496591504981648,
1486
+ "loss": 0.4711,
1487
+ "step": 2110
1488
+ },
1489
+ {
1490
+ "epoch": 0.27759137109841725,
1491
+ "grad_norm": 1.497841715812683,
1492
+ "learning_rate": 0.00014470372312532775,
1493
+ "loss": 0.4935,
1494
+ "step": 2120
1495
+ },
1496
+ {
1497
+ "epoch": 0.27890076435831546,
1498
+ "grad_norm": 1.5082463026046753,
1499
+ "learning_rate": 0.00014444153120083902,
1500
+ "loss": 0.4979,
1501
+ "step": 2130
1502
+ },
1503
+ {
1504
+ "epoch": 0.28021015761821366,
1505
+ "grad_norm": 1.2458934783935547,
1506
+ "learning_rate": 0.0001441793392763503,
1507
+ "loss": 0.5644,
1508
+ "step": 2140
1509
+ },
1510
+ {
1511
+ "epoch": 0.28151955087811187,
1512
+ "grad_norm": 1.730130910873413,
1513
+ "learning_rate": 0.00014391714735186156,
1514
+ "loss": 0.4749,
1515
+ "step": 2150
1516
+ },
1517
+ {
1518
+ "epoch": 0.2828289441380101,
1519
+ "grad_norm": 1.2587112188339233,
1520
+ "learning_rate": 0.00014365495542737283,
1521
+ "loss": 0.5175,
1522
+ "step": 2160
1523
+ },
1524
+ {
1525
+ "epoch": 0.2841383373979082,
1526
+ "grad_norm": 1.431119441986084,
1527
+ "learning_rate": 0.0001433927635028841,
1528
+ "loss": 0.5597,
1529
+ "step": 2170
1530
+ },
1531
+ {
1532
+ "epoch": 0.28544773065780643,
1533
+ "grad_norm": 1.5383937358856201,
1534
+ "learning_rate": 0.0001431305715783954,
1535
+ "loss": 0.5153,
1536
+ "step": 2180
1537
+ },
1538
+ {
1539
+ "epoch": 0.28675712391770464,
1540
+ "grad_norm": 1.4311727285385132,
1541
+ "learning_rate": 0.00014286837965390666,
1542
+ "loss": 0.5452,
1543
+ "step": 2190
1544
+ },
1545
+ {
1546
+ "epoch": 0.28806651717760284,
1547
+ "grad_norm": 1.2555975914001465,
1548
+ "learning_rate": 0.00014260618772941793,
1549
+ "loss": 0.4937,
1550
+ "step": 2200
1551
+ },
1552
+ {
1553
+ "epoch": 0.28937591043750105,
1554
+ "grad_norm": 1.3781330585479736,
1555
+ "learning_rate": 0.0001423439958049292,
1556
+ "loss": 0.4537,
1557
+ "step": 2210
1558
+ },
1559
+ {
1560
+ "epoch": 0.2906853036973992,
1561
+ "grad_norm": 1.4810888767242432,
1562
+ "learning_rate": 0.00014208180388044047,
1563
+ "loss": 0.396,
1564
+ "step": 2220
1565
+ },
1566
+ {
1567
+ "epoch": 0.2919946969572974,
1568
+ "grad_norm": 1.6619911193847656,
1569
+ "learning_rate": 0.00014181961195595177,
1570
+ "loss": 0.4756,
1571
+ "step": 2230
1572
+ },
1573
+ {
1574
+ "epoch": 0.2933040902171956,
1575
+ "grad_norm": 1.3403065204620361,
1576
+ "learning_rate": 0.00014155742003146303,
1577
+ "loss": 0.5157,
1578
+ "step": 2240
1579
+ },
1580
+ {
1581
+ "epoch": 0.2946134834770938,
1582
+ "grad_norm": 1.4188278913497925,
1583
+ "learning_rate": 0.0001412952281069743,
1584
+ "loss": 0.5237,
1585
+ "step": 2250
1586
+ },
1587
+ {
1588
+ "epoch": 0.29592287673699197,
1589
+ "grad_norm": 1.852266550064087,
1590
+ "learning_rate": 0.00014103303618248557,
1591
+ "loss": 0.4558,
1592
+ "step": 2260
1593
+ },
1594
+ {
1595
+ "epoch": 0.2972322699968902,
1596
+ "grad_norm": 1.3092072010040283,
1597
+ "learning_rate": 0.00014077084425799687,
1598
+ "loss": 0.4437,
1599
+ "step": 2270
1600
+ },
1601
+ {
1602
+ "epoch": 0.2985416632567884,
1603
+ "grad_norm": 1.4190593957901,
1604
+ "learning_rate": 0.00014050865233350814,
1605
+ "loss": 0.4717,
1606
+ "step": 2280
1607
+ },
1608
+ {
1609
+ "epoch": 0.2998510565166866,
1610
+ "grad_norm": 1.4562608003616333,
1611
+ "learning_rate": 0.0001402464604090194,
1612
+ "loss": 0.4744,
1613
+ "step": 2290
1614
+ },
1615
+ {
1616
+ "epoch": 0.3011604497765848,
1617
+ "grad_norm": 1.4576420783996582,
1618
+ "learning_rate": 0.00013998426848453068,
1619
+ "loss": 0.4429,
1620
+ "step": 2300
1621
+ },
1622
+ {
1623
+ "epoch": 0.30246984303648294,
1624
+ "grad_norm": 1.867145299911499,
1625
+ "learning_rate": 0.00013972207656004194,
1626
+ "loss": 0.4881,
1627
+ "step": 2310
1628
+ },
1629
+ {
1630
+ "epoch": 0.30377923629638115,
1631
+ "grad_norm": 1.3077807426452637,
1632
+ "learning_rate": 0.00013945988463555324,
1633
+ "loss": 0.4067,
1634
+ "step": 2320
1635
+ },
1636
+ {
1637
+ "epoch": 0.30508862955627936,
1638
+ "grad_norm": 1.3587473630905151,
1639
+ "learning_rate": 0.0001391976927110645,
1640
+ "loss": 0.4428,
1641
+ "step": 2330
1642
+ },
1643
+ {
1644
+ "epoch": 0.30639802281617756,
1645
+ "grad_norm": 1.6012579202651978,
1646
+ "learning_rate": 0.00013893550078657578,
1647
+ "loss": 0.4572,
1648
+ "step": 2340
1649
+ },
1650
+ {
1651
+ "epoch": 0.30770741607607577,
1652
+ "grad_norm": 1.2226955890655518,
1653
+ "learning_rate": 0.00013867330886208705,
1654
+ "loss": 0.4117,
1655
+ "step": 2350
1656
+ },
1657
+ {
1658
+ "epoch": 0.3090168093359739,
1659
+ "grad_norm": 1.4615281820297241,
1660
+ "learning_rate": 0.00013841111693759834,
1661
+ "loss": 0.4561,
1662
+ "step": 2360
1663
+ },
1664
+ {
1665
+ "epoch": 0.3103262025958721,
1666
+ "grad_norm": 1.401014804840088,
1667
+ "learning_rate": 0.0001381489250131096,
1668
+ "loss": 0.441,
1669
+ "step": 2370
1670
+ },
1671
+ {
1672
+ "epoch": 0.31163559585577033,
1673
+ "grad_norm": 1.4875798225402832,
1674
+ "learning_rate": 0.00013788673308862088,
1675
+ "loss": 0.3991,
1676
+ "step": 2380
1677
+ },
1678
+ {
1679
+ "epoch": 0.31294498911566854,
1680
+ "grad_norm": 1.1867239475250244,
1681
+ "learning_rate": 0.00013762454116413215,
1682
+ "loss": 0.4223,
1683
+ "step": 2390
1684
+ },
1685
+ {
1686
+ "epoch": 0.31425438237556674,
1687
+ "grad_norm": 1.3172953128814697,
1688
+ "learning_rate": 0.00013736234923964342,
1689
+ "loss": 0.4388,
1690
+ "step": 2400
1691
+ },
1692
+ {
1693
+ "epoch": 0.3155637756354649,
1694
+ "grad_norm": 1.4044665098190308,
1695
+ "learning_rate": 0.00013710015731515472,
1696
+ "loss": 0.4102,
1697
+ "step": 2410
1698
+ },
1699
+ {
1700
+ "epoch": 0.3168731688953631,
1701
+ "grad_norm": 1.5709283351898193,
1702
+ "learning_rate": 0.00013683796539066599,
1703
+ "loss": 0.4837,
1704
+ "step": 2420
1705
+ },
1706
+ {
1707
+ "epoch": 0.3181825621552613,
1708
+ "grad_norm": 1.2237786054611206,
1709
+ "learning_rate": 0.00013657577346617725,
1710
+ "loss": 0.4452,
1711
+ "step": 2430
1712
+ },
1713
+ {
1714
+ "epoch": 0.3194919554151595,
1715
+ "grad_norm": 1.8869267702102661,
1716
+ "learning_rate": 0.00013631358154168852,
1717
+ "loss": 0.4077,
1718
+ "step": 2440
1719
+ },
1720
+ {
1721
+ "epoch": 0.3208013486750577,
1722
+ "grad_norm": 1.226117491722107,
1723
+ "learning_rate": 0.0001360513896171998,
1724
+ "loss": 0.4109,
1725
+ "step": 2450
1726
+ },
1727
+ {
1728
+ "epoch": 0.32211074193495587,
1729
+ "grad_norm": 1.6273385286331177,
1730
+ "learning_rate": 0.0001357891976927111,
1731
+ "loss": 0.3596,
1732
+ "step": 2460
1733
+ },
1734
+ {
1735
+ "epoch": 0.3234201351948541,
1736
+ "grad_norm": 1.4535574913024902,
1737
+ "learning_rate": 0.00013552700576822236,
1738
+ "loss": 0.3996,
1739
+ "step": 2470
1740
+ },
1741
+ {
1742
+ "epoch": 0.3247295284547523,
1743
+ "grad_norm": 1.6052360534667969,
1744
+ "learning_rate": 0.00013526481384373363,
1745
+ "loss": 0.4082,
1746
+ "step": 2480
1747
+ },
1748
+ {
1749
+ "epoch": 0.3260389217146505,
1750
+ "grad_norm": 1.9104530811309814,
1751
+ "learning_rate": 0.0001350026219192449,
1752
+ "loss": 0.4089,
1753
+ "step": 2490
1754
+ },
1755
+ {
1756
+ "epoch": 0.3273483149745487,
1757
+ "grad_norm": 1.6006613969802856,
1758
+ "learning_rate": 0.0001347404299947562,
1759
+ "loss": 0.3848,
1760
+ "step": 2500
1761
+ },
1762
+ {
1763
+ "epoch": 0.32865770823444684,
1764
+ "grad_norm": 1.4406352043151855,
1765
+ "learning_rate": 0.00013447823807026746,
1766
+ "loss": 0.3926,
1767
+ "step": 2510
1768
+ },
1769
+ {
1770
+ "epoch": 0.32996710149434505,
1771
+ "grad_norm": 1.3455756902694702,
1772
+ "learning_rate": 0.00013421604614577873,
1773
+ "loss": 0.4203,
1774
+ "step": 2520
1775
+ },
1776
+ {
1777
+ "epoch": 0.33127649475424326,
1778
+ "grad_norm": 1.7718679904937744,
1779
+ "learning_rate": 0.00013395385422129,
1780
+ "loss": 0.3765,
1781
+ "step": 2530
1782
+ },
1783
+ {
1784
+ "epoch": 0.33258588801414146,
1785
+ "grad_norm": 1.410130500793457,
1786
+ "learning_rate": 0.00013369166229680127,
1787
+ "loss": 0.3646,
1788
+ "step": 2540
1789
+ },
1790
+ {
1791
+ "epoch": 0.33389528127403967,
1792
+ "grad_norm": 1.6361408233642578,
1793
+ "learning_rate": 0.00013342947037231254,
1794
+ "loss": 0.3917,
1795
+ "step": 2550
1796
+ },
1797
+ {
1798
+ "epoch": 0.3352046745339378,
1799
+ "grad_norm": 1.7627660036087036,
1800
+ "learning_rate": 0.0001331672784478238,
1801
+ "loss": 0.367,
1802
+ "step": 2560
1803
+ },
1804
+ {
1805
+ "epoch": 0.336514067793836,
1806
+ "grad_norm": 1.2431906461715698,
1807
+ "learning_rate": 0.00013290508652333508,
1808
+ "loss": 0.3708,
1809
+ "step": 2570
1810
+ },
1811
+ {
1812
+ "epoch": 0.33782346105373423,
1813
+ "grad_norm": 1.4763669967651367,
1814
+ "learning_rate": 0.00013264289459884634,
1815
+ "loss": 0.377,
1816
+ "step": 2580
1817
+ },
1818
+ {
1819
+ "epoch": 0.33913285431363244,
1820
+ "grad_norm": 2.1701712608337402,
1821
+ "learning_rate": 0.00013238070267435761,
1822
+ "loss": 0.344,
1823
+ "step": 2590
1824
+ },
1825
+ {
1826
+ "epoch": 0.3404422475735306,
1827
+ "grad_norm": 1.4388126134872437,
1828
+ "learning_rate": 0.0001321185107498689,
1829
+ "loss": 0.3556,
1830
+ "step": 2600
1831
+ },
1832
+ {
1833
+ "epoch": 0.3417516408334288,
1834
+ "grad_norm": 1.2981114387512207,
1835
+ "learning_rate": 0.00013185631882538018,
1836
+ "loss": 0.3272,
1837
+ "step": 2610
1838
+ },
1839
+ {
1840
+ "epoch": 0.343061034093327,
1841
+ "grad_norm": 1.539335012435913,
1842
+ "learning_rate": 0.00013159412690089145,
1843
+ "loss": 0.4132,
1844
+ "step": 2620
1845
+ },
1846
+ {
1847
+ "epoch": 0.3443704273532252,
1848
+ "grad_norm": 1.9272770881652832,
1849
+ "learning_rate": 0.00013133193497640272,
1850
+ "loss": 0.4121,
1851
+ "step": 2630
1852
+ },
1853
+ {
1854
+ "epoch": 0.3456798206131234,
1855
+ "grad_norm": 1.4415314197540283,
1856
+ "learning_rate": 0.000131069743051914,
1857
+ "loss": 0.3595,
1858
+ "step": 2640
1859
+ },
1860
+ {
1861
+ "epoch": 0.34698921387302156,
1862
+ "grad_norm": 1.3155860900878906,
1863
+ "learning_rate": 0.00013080755112742528,
1864
+ "loss": 0.3611,
1865
+ "step": 2650
1866
+ },
1867
+ {
1868
+ "epoch": 0.34829860713291977,
1869
+ "grad_norm": 1.507858157157898,
1870
+ "learning_rate": 0.00013054535920293655,
1871
+ "loss": 0.3813,
1872
+ "step": 2660
1873
+ },
1874
+ {
1875
+ "epoch": 0.349608000392818,
1876
+ "grad_norm": 1.5444693565368652,
1877
+ "learning_rate": 0.00013028316727844782,
1878
+ "loss": 0.3527,
1879
+ "step": 2670
1880
+ },
1881
+ {
1882
+ "epoch": 0.3509173936527162,
1883
+ "grad_norm": 1.4008456468582153,
1884
+ "learning_rate": 0.0001300209753539591,
1885
+ "loss": 0.3573,
1886
+ "step": 2680
1887
+ },
1888
+ {
1889
+ "epoch": 0.3522267869126144,
1890
+ "grad_norm": 1.6443661451339722,
1891
+ "learning_rate": 0.00012975878342947039,
1892
+ "loss": 0.3885,
1893
+ "step": 2690
1894
+ },
1895
+ {
1896
+ "epoch": 0.35353618017251254,
1897
+ "grad_norm": 1.513431429862976,
1898
+ "learning_rate": 0.00012949659150498165,
1899
+ "loss": 0.3332,
1900
+ "step": 2700
1901
+ },
1902
+ {
1903
+ "epoch": 0.35484557343241074,
1904
+ "grad_norm": 1.6663899421691895,
1905
+ "learning_rate": 0.00012923439958049292,
1906
+ "loss": 0.3769,
1907
+ "step": 2710
1908
+ },
1909
+ {
1910
+ "epoch": 0.35615496669230895,
1911
+ "grad_norm": 1.2655925750732422,
1912
+ "learning_rate": 0.0001289722076560042,
1913
+ "loss": 0.4177,
1914
+ "step": 2720
1915
+ },
1916
+ {
1917
+ "epoch": 0.35746435995220716,
1918
+ "grad_norm": 1.324833869934082,
1919
+ "learning_rate": 0.00012871001573151546,
1920
+ "loss": 0.3501,
1921
+ "step": 2730
1922
+ },
1923
+ {
1924
+ "epoch": 0.35877375321210536,
1925
+ "grad_norm": 1.4842655658721924,
1926
+ "learning_rate": 0.00012844782380702676,
1927
+ "loss": 0.3223,
1928
+ "step": 2740
1929
+ },
1930
+ {
1931
+ "epoch": 0.3600831464720035,
1932
+ "grad_norm": 1.4087761640548706,
1933
+ "learning_rate": 0.00012818563188253803,
1934
+ "loss": 0.3308,
1935
+ "step": 2750
1936
+ },
1937
+ {
1938
+ "epoch": 0.3613925397319017,
1939
+ "grad_norm": 1.7493972778320312,
1940
+ "learning_rate": 0.0001279234399580493,
1941
+ "loss": 0.3655,
1942
+ "step": 2760
1943
+ },
1944
+ {
1945
+ "epoch": 0.3627019329917999,
1946
+ "grad_norm": 1.4829336404800415,
1947
+ "learning_rate": 0.00012766124803356056,
1948
+ "loss": 0.3674,
1949
+ "step": 2770
1950
+ },
1951
+ {
1952
+ "epoch": 0.36401132625169813,
1953
+ "grad_norm": 1.39944589138031,
1954
+ "learning_rate": 0.00012739905610907186,
1955
+ "loss": 0.3285,
1956
+ "step": 2780
1957
+ },
1958
+ {
1959
+ "epoch": 0.36532071951159634,
1960
+ "grad_norm": 1.5995631217956543,
1961
+ "learning_rate": 0.00012713686418458313,
1962
+ "loss": 0.3431,
1963
+ "step": 2790
1964
+ },
1965
+ {
1966
+ "epoch": 0.3666301127714945,
1967
+ "grad_norm": 1.0113691091537476,
1968
+ "learning_rate": 0.0001268746722600944,
1969
+ "loss": 0.3389,
1970
+ "step": 2800
1971
+ },
1972
+ {
1973
+ "epoch": 0.3679395060313927,
1974
+ "grad_norm": 1.6544948816299438,
1975
+ "learning_rate": 0.00012661248033560567,
1976
+ "loss": 0.323,
1977
+ "step": 2810
1978
+ },
1979
+ {
1980
+ "epoch": 0.3692488992912909,
1981
+ "grad_norm": 1.8022606372833252,
1982
+ "learning_rate": 0.00012635028841111694,
1983
+ "loss": 0.3777,
1984
+ "step": 2820
1985
+ },
1986
+ {
1987
+ "epoch": 0.3705582925511891,
1988
+ "grad_norm": 1.6005665063858032,
1989
+ "learning_rate": 0.00012608809648662823,
1990
+ "loss": 0.3482,
1991
+ "step": 2830
1992
+ },
1993
+ {
1994
+ "epoch": 0.3718676858110873,
1995
+ "grad_norm": 1.2550064325332642,
1996
+ "learning_rate": 0.0001258259045621395,
1997
+ "loss": 0.3288,
1998
+ "step": 2840
1999
+ },
2000
+ {
2001
+ "epoch": 0.37317707907098546,
2002
+ "grad_norm": 2.43110728263855,
2003
+ "learning_rate": 0.00012556371263765077,
2004
+ "loss": 0.3511,
2005
+ "step": 2850
2006
+ },
2007
+ {
2008
+ "epoch": 0.37448647233088367,
2009
+ "grad_norm": 1.5041906833648682,
2010
+ "learning_rate": 0.00012530152071316204,
2011
+ "loss": 0.3578,
2012
+ "step": 2860
2013
+ },
2014
+ {
2015
+ "epoch": 0.3757958655907819,
2016
+ "grad_norm": 1.6031140089035034,
2017
+ "learning_rate": 0.0001250393287886733,
2018
+ "loss": 0.3213,
2019
+ "step": 2870
2020
+ },
2021
+ {
2022
+ "epoch": 0.3771052588506801,
2023
+ "grad_norm": 1.025795817375183,
2024
+ "learning_rate": 0.0001247771368641846,
2025
+ "loss": 0.3352,
2026
+ "step": 2880
2027
+ },
2028
+ {
2029
+ "epoch": 0.3784146521105783,
2030
+ "grad_norm": 1.934812068939209,
2031
+ "learning_rate": 0.00012451494493969587,
2032
+ "loss": 0.3365,
2033
+ "step": 2890
2034
+ },
2035
+ {
2036
+ "epoch": 0.37972404537047644,
2037
+ "grad_norm": 1.0730398893356323,
2038
+ "learning_rate": 0.00012425275301520714,
2039
+ "loss": 0.3365,
2040
+ "step": 2900
2041
+ },
2042
+ {
2043
+ "epoch": 0.38103343863037464,
2044
+ "grad_norm": 1.3496712446212769,
2045
+ "learning_rate": 0.0001239905610907184,
2046
+ "loss": 0.3548,
2047
+ "step": 2910
2048
+ },
2049
+ {
2050
+ "epoch": 0.38234283189027285,
2051
+ "grad_norm": 1.3053911924362183,
2052
+ "learning_rate": 0.0001237283691662297,
2053
+ "loss": 0.3563,
2054
+ "step": 2920
2055
+ },
2056
+ {
2057
+ "epoch": 0.38365222515017106,
2058
+ "grad_norm": 1.3640882968902588,
2059
+ "learning_rate": 0.00012346617724174098,
2060
+ "loss": 0.365,
2061
+ "step": 2930
2062
+ },
2063
+ {
2064
+ "epoch": 0.38496161841006926,
2065
+ "grad_norm": 1.3266191482543945,
2066
+ "learning_rate": 0.00012320398531725225,
2067
+ "loss": 0.2981,
2068
+ "step": 2940
2069
+ },
2070
+ {
2071
+ "epoch": 0.3862710116699674,
2072
+ "grad_norm": 1.32815682888031,
2073
+ "learning_rate": 0.00012294179339276352,
2074
+ "loss": 0.3544,
2075
+ "step": 2950
2076
+ },
2077
+ {
2078
+ "epoch": 0.3875804049298656,
2079
+ "grad_norm": 1.4236459732055664,
2080
+ "learning_rate": 0.00012267960146827479,
2081
+ "loss": 0.3095,
2082
+ "step": 2960
2083
+ },
2084
+ {
2085
+ "epoch": 0.3888897981897638,
2086
+ "grad_norm": 1.1536756753921509,
2087
+ "learning_rate": 0.00012241740954378605,
2088
+ "loss": 0.3125,
2089
+ "step": 2970
2090
+ },
2091
+ {
2092
+ "epoch": 0.39019919144966203,
2093
+ "grad_norm": 1.4237791299819946,
2094
+ "learning_rate": 0.00012215521761929732,
2095
+ "loss": 0.3207,
2096
+ "step": 2980
2097
+ },
2098
+ {
2099
+ "epoch": 0.3915085847095602,
2100
+ "grad_norm": 1.4023237228393555,
2101
+ "learning_rate": 0.0001218930256948086,
2102
+ "loss": 0.3714,
2103
+ "step": 2990
2104
+ },
2105
+ {
2106
+ "epoch": 0.3928179779694584,
2107
+ "grad_norm": 1.3556010723114014,
2108
+ "learning_rate": 0.00012163083377031987,
2109
+ "loss": 0.3313,
2110
+ "step": 3000
2111
+ },
2112
+ {
2113
+ "epoch": 0.3941273712293566,
2114
+ "grad_norm": 1.2301980257034302,
2115
+ "learning_rate": 0.00012136864184583114,
2116
+ "loss": 0.3062,
2117
+ "step": 3010
2118
+ },
2119
+ {
2120
+ "epoch": 0.3954367644892548,
2121
+ "grad_norm": 1.3532170057296753,
2122
+ "learning_rate": 0.00012110644992134244,
2123
+ "loss": 0.2946,
2124
+ "step": 3020
2125
+ },
2126
+ {
2127
+ "epoch": 0.396746157749153,
2128
+ "grad_norm": 1.2680764198303223,
2129
+ "learning_rate": 0.00012084425799685371,
2130
+ "loss": 0.3005,
2131
+ "step": 3030
2132
+ },
2133
+ {
2134
+ "epoch": 0.39805555100905116,
2135
+ "grad_norm": 1.5346810817718506,
2136
+ "learning_rate": 0.00012058206607236498,
2137
+ "loss": 0.3363,
2138
+ "step": 3040
2139
+ },
2140
+ {
2141
+ "epoch": 0.39936494426894936,
2142
+ "grad_norm": 1.423195242881775,
2143
+ "learning_rate": 0.00012031987414787625,
2144
+ "loss": 0.3294,
2145
+ "step": 3050
2146
+ },
2147
+ {
2148
+ "epoch": 0.40067433752884757,
2149
+ "grad_norm": 1.599571704864502,
2150
+ "learning_rate": 0.00012005768222338753,
2151
+ "loss": 0.3469,
2152
+ "step": 3060
2153
+ },
2154
+ {
2155
+ "epoch": 0.4019837307887458,
2156
+ "grad_norm": 1.2103453874588013,
2157
+ "learning_rate": 0.0001197954902988988,
2158
+ "loss": 0.2827,
2159
+ "step": 3070
2160
+ },
2161
+ {
2162
+ "epoch": 0.403293124048644,
2163
+ "grad_norm": 1.3197276592254639,
2164
+ "learning_rate": 0.00011953329837441007,
2165
+ "loss": 0.3194,
2166
+ "step": 3080
2167
+ },
2168
+ {
2169
+ "epoch": 0.40460251730854213,
2170
+ "grad_norm": 1.291038990020752,
2171
+ "learning_rate": 0.00011927110644992135,
2172
+ "loss": 0.2798,
2173
+ "step": 3090
2174
+ },
2175
+ {
2176
+ "epoch": 0.40591191056844034,
2177
+ "grad_norm": 1.1556978225708008,
2178
+ "learning_rate": 0.00011900891452543262,
2179
+ "loss": 0.3318,
2180
+ "step": 3100
2181
+ },
2182
+ {
2183
+ "epoch": 0.40722130382833854,
2184
+ "grad_norm": 1.3520278930664062,
2185
+ "learning_rate": 0.0001187467226009439,
2186
+ "loss": 0.3222,
2187
+ "step": 3110
2188
+ },
2189
+ {
2190
+ "epoch": 0.40853069708823675,
2191
+ "grad_norm": 1.0671277046203613,
2192
+ "learning_rate": 0.00011848453067645517,
2193
+ "loss": 0.268,
2194
+ "step": 3120
2195
+ },
2196
+ {
2197
+ "epoch": 0.40984009034813496,
2198
+ "grad_norm": 1.442131757736206,
2199
+ "learning_rate": 0.00011822233875196644,
2200
+ "loss": 0.3028,
2201
+ "step": 3130
2202
+ },
2203
+ {
2204
+ "epoch": 0.4111494836080331,
2205
+ "grad_norm": 1.5673497915267944,
2206
+ "learning_rate": 0.00011796014682747771,
2207
+ "loss": 0.31,
2208
+ "step": 3140
2209
+ },
2210
+ {
2211
+ "epoch": 0.4124588768679313,
2212
+ "grad_norm": 1.2009717226028442,
2213
+ "learning_rate": 0.00011769795490298898,
2214
+ "loss": 0.2986,
2215
+ "step": 3150
2216
+ },
2217
+ {
2218
+ "epoch": 0.4137682701278295,
2219
+ "grad_norm": 1.2754930257797241,
2220
+ "learning_rate": 0.00011743576297850027,
2221
+ "loss": 0.3352,
2222
+ "step": 3160
2223
+ },
2224
+ {
2225
+ "epoch": 0.4150776633877277,
2226
+ "grad_norm": 1.6189430952072144,
2227
+ "learning_rate": 0.00011717357105401154,
2228
+ "loss": 0.3804,
2229
+ "step": 3170
2230
+ },
2231
+ {
2232
+ "epoch": 0.41638705664762593,
2233
+ "grad_norm": 1.6117827892303467,
2234
+ "learning_rate": 0.00011691137912952281,
2235
+ "loss": 0.3239,
2236
+ "step": 3180
2237
+ },
2238
+ {
2239
+ "epoch": 0.4176964499075241,
2240
+ "grad_norm": 1.7495907545089722,
2241
+ "learning_rate": 0.00011664918720503408,
2242
+ "loss": 0.3145,
2243
+ "step": 3190
2244
+ },
2245
+ {
2246
+ "epoch": 0.4190058431674223,
2247
+ "grad_norm": 1.2301905155181885,
2248
+ "learning_rate": 0.00011638699528054538,
2249
+ "loss": 0.2776,
2250
+ "step": 3200
2251
+ },
2252
+ {
2253
+ "epoch": 0.4203152364273205,
2254
+ "grad_norm": 1.3571341037750244,
2255
+ "learning_rate": 0.00011612480335605665,
2256
+ "loss": 0.3019,
2257
+ "step": 3210
2258
+ },
2259
+ {
2260
+ "epoch": 0.4216246296872187,
2261
+ "grad_norm": 0.9271483421325684,
2262
+ "learning_rate": 0.00011586261143156792,
2263
+ "loss": 0.2929,
2264
+ "step": 3220
2265
+ },
2266
+ {
2267
+ "epoch": 0.4229340229471169,
2268
+ "grad_norm": 1.294146180152893,
2269
+ "learning_rate": 0.00011560041950707918,
2270
+ "loss": 0.3095,
2271
+ "step": 3230
2272
+ },
2273
+ {
2274
+ "epoch": 0.42424341620701506,
2275
+ "grad_norm": 1.5177209377288818,
2276
+ "learning_rate": 0.00011533822758259045,
2277
+ "loss": 0.2714,
2278
+ "step": 3240
2279
+ },
2280
+ {
2281
+ "epoch": 0.42555280946691326,
2282
+ "grad_norm": 1.1218962669372559,
2283
+ "learning_rate": 0.00011507603565810175,
2284
+ "loss": 0.282,
2285
+ "step": 3250
2286
+ },
2287
+ {
2288
+ "epoch": 0.42686220272681147,
2289
+ "grad_norm": 1.2807728052139282,
2290
+ "learning_rate": 0.00011481384373361302,
2291
+ "loss": 0.3461,
2292
+ "step": 3260
2293
+ },
2294
+ {
2295
+ "epoch": 0.4281715959867097,
2296
+ "grad_norm": 1.1680692434310913,
2297
+ "learning_rate": 0.00011455165180912429,
2298
+ "loss": 0.2842,
2299
+ "step": 3270
2300
+ },
2301
+ {
2302
+ "epoch": 0.4294809892466079,
2303
+ "grad_norm": 1.6534638404846191,
2304
+ "learning_rate": 0.00011428945988463556,
2305
+ "loss": 0.2774,
2306
+ "step": 3280
2307
+ },
2308
+ {
2309
+ "epoch": 0.43079038250650603,
2310
+ "grad_norm": 1.2321938276290894,
2311
+ "learning_rate": 0.00011402726796014683,
2312
+ "loss": 0.2841,
2313
+ "step": 3290
2314
+ },
2315
+ {
2316
+ "epoch": 0.43209977576640424,
2317
+ "grad_norm": 1.6666522026062012,
2318
+ "learning_rate": 0.00011376507603565811,
2319
+ "loss": 0.2993,
2320
+ "step": 3300
2321
+ },
2322
+ {
2323
+ "epoch": 0.43340916902630244,
2324
+ "grad_norm": 1.8330938816070557,
2325
+ "learning_rate": 0.00011350288411116938,
2326
+ "loss": 0.2834,
2327
+ "step": 3310
2328
+ },
2329
+ {
2330
+ "epoch": 0.43471856228620065,
2331
+ "grad_norm": 1.570809245109558,
2332
+ "learning_rate": 0.00011324069218668065,
2333
+ "loss": 0.2885,
2334
+ "step": 3320
2335
+ },
2336
+ {
2337
+ "epoch": 0.4360279555460988,
2338
+ "grad_norm": 1.4093183279037476,
2339
+ "learning_rate": 0.00011297850026219192,
2340
+ "loss": 0.2872,
2341
+ "step": 3330
2342
+ },
2343
+ {
2344
+ "epoch": 0.437337348805997,
2345
+ "grad_norm": 0.8298211097717285,
2346
+ "learning_rate": 0.00011271630833770321,
2347
+ "loss": 0.2884,
2348
+ "step": 3340
2349
+ },
2350
+ {
2351
+ "epoch": 0.4386467420658952,
2352
+ "grad_norm": 1.1143261194229126,
2353
+ "learning_rate": 0.00011245411641321448,
2354
+ "loss": 0.279,
2355
+ "step": 3350
2356
+ },
2357
+ {
2358
+ "epoch": 0.4399561353257934,
2359
+ "grad_norm": 1.1568537950515747,
2360
+ "learning_rate": 0.00011219192448872575,
2361
+ "loss": 0.2724,
2362
+ "step": 3360
2363
+ },
2364
+ {
2365
+ "epoch": 0.4412655285856916,
2366
+ "grad_norm": 0.8700618147850037,
2367
+ "learning_rate": 0.00011192973256423702,
2368
+ "loss": 0.2563,
2369
+ "step": 3370
2370
+ },
2371
+ {
2372
+ "epoch": 0.4425749218455898,
2373
+ "grad_norm": 0.974319577217102,
2374
+ "learning_rate": 0.00011166754063974829,
2375
+ "loss": 0.2864,
2376
+ "step": 3380
2377
+ },
2378
+ {
2379
+ "epoch": 0.443884315105488,
2380
+ "grad_norm": 0.9288910031318665,
2381
+ "learning_rate": 0.00011140534871525958,
2382
+ "loss": 0.2717,
2383
+ "step": 3390
2384
+ },
2385
+ {
2386
+ "epoch": 0.4451937083653862,
2387
+ "grad_norm": 1.0942648649215698,
2388
+ "learning_rate": 0.00011114315679077085,
2389
+ "loss": 0.2625,
2390
+ "step": 3400
2391
+ },
2392
+ {
2393
+ "epoch": 0.4465031016252844,
2394
+ "grad_norm": 1.3224159479141235,
2395
+ "learning_rate": 0.00011088096486628212,
2396
+ "loss": 0.2719,
2397
+ "step": 3410
2398
+ }
2399
+ ],
2400
+ "logging_steps": 10,
2401
+ "max_steps": 7638,
2402
+ "num_input_tokens_seen": 0,
2403
+ "num_train_epochs": 1,
2404
+ "save_steps": 10,
2405
+ "stateful_callbacks": {
2406
+ "TrainerControl": {
2407
+ "args": {
2408
+ "should_epoch_stop": false,
2409
+ "should_evaluate": false,
2410
+ "should_log": false,
2411
+ "should_save": true,
2412
+ "should_training_stop": false
2413
+ },
2414
+ "attributes": {}
2415
+ }
2416
+ },
2417
+ "total_flos": 2.8131850187780976e+17,
2418
+ "train_batch_size": 1,
2419
+ "trial_name": null,
2420
+ "trial_params": null
2421
+ }
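The entries above are the tail of this checkpoint's trainer_state.json: one record per logging_steps=10 optimizer steps, ending at step 3410 of max_steps=7638 (3410/7638 ≈ 0.4465, matching the final "epoch" value). With train_batch_size=1 and the gradient_accumulation_steps=8 used in the training script further down, each step consumes 8 samples, so one full epoch of 7638 steps covers roughly 61,000 training examples. A minimal sketch for inspecting the loss curve from such a file, assuming a local checkpoint at a placeholder path:

import json

# Placeholder path -- point this at any checkpoint saved under outputs/.
with open("outputs/checkpoint-3410/trainer_state.json") as f:
    state = json.load(f)

# Each record in log_history carries epoch, step, loss, grad_norm, learning_rate.
losses = [e for e in state["log_history"] if "loss" in e]
for e in losses[-5:]:
    print(f"step {e['step']:>5}  loss {e['loss']:.4f}  lr {e['learning_rate']:.2e}")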
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:28dc0b9cd89143400df8f8fafb91099efdbf577030cb3ebd021914598e90413e
3
+ size 6097
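training_args.bin is a pickled TrainingArguments object written with torch.save (stored via Git LFS here). To audit the exact hyperparameters a checkpoint was trained with, it can be loaded back; a sketch, assuming a local copy of the file and a transformers install that provides the pickled class:

import torch

# Pickled object, not a tensor file, so weights_only=False is required
# on PyTorch versions where weights_only defaults to True.
args = torch.load("training_args.bin", weights_only=False)
print(args.learning_rate, args.per_device_train_batch_size, args.gradient_accumulation_steps)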
unsloth/roleplay-zh-sharegpt-gpt4-data.py ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Final, complete version - optimized for an RTX 3070 (8GB)
2
+
3
+ import torch
4
+ from unsloth import FastLanguageModel
5
+ from datasets import load_dataset
6
+ from trl import SFTTrainer
7
+ from transformers import TrainingArguments, pipeline
8
+ import os
9
+
10
+ # --- Local path configuration (no changes needed) ---
11
+ local_model_path = "./gemma-3-4b-it-qat-unsloth-bnb-4bit"
12
+ local_data_dir = "./roleplay-zh-sharegpt-gpt4-data"
13
+ local_data_file = os.path.join(local_data_dir, "sharegpt_formatted_data-evol-gpt35.jsonl")
14
+ # --- End of configuration ---
15
+
16
+
17
+ # 1. Load the model and tokenizer (no changes needed)
18
+ max_seq_length = 2048
19
+ dtype = None
20
+ load_in_4bit = True
21
+
22
+ print(f"✅ 步骤 1/5: 正在从本地路径 '{local_model_path}' 加载模型和分词器...")
23
+ model, tokenizer = FastLanguageModel.from_pretrained(
24
+ model_name=local_model_path,
25
+ max_seq_length=max_seq_length,
26
+ dtype=dtype,
27
+ load_in_4bit=load_in_4bit,
28
+ )
29
+
30
+ # 2. Configure LoRA (no changes needed)
31
+ print("✅ Step 2/5: Configuring the LoRA adapter...")
32
+ model = FastLanguageModel.get_peft_model(
33
+ model,
34
+ r=16,
35
+ target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
36
+ lora_alpha=16,
37
+ lora_dropout=0,
38
+ bias="none",
39
+ use_gradient_checkpointing=True,
40
+ random_state=3407,
41
+ )
42
+
43
+ # 3. Load and prepare the dataset (no changes needed)
44
+ alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
45
+
46
+ ### Instruction:
47
+ {}
48
+
49
+ ### Input:
50
+ {}
51
+
52
+ ### Response:
53
+ {}"""
54
+
55
+ EOS_TOKEN = tokenizer.eos_token
56
+
57
+ def formatting_prompts_func(examples):
58
+ all_texts = []
59
+ for i in range(len(examples['system_prompt'])):
60
+ system_prompt = examples['system_prompt'][i]
61
+ conversations = examples['conversations'][i]
62
+ for j in range(0, len(conversations), 2):
63
+ if j + 1 < len(conversations):
64
+ human_turn = conversations[j]
65
+ gpt_turn = conversations[j+1]
66
+ if human_turn['from'] == 'human' and gpt_turn['from'] == 'gpt':
67
+ instruction = system_prompt
68
+ input_text = human_turn['value']
69
+ output_text = gpt_turn['value']
70
+ text = alpaca_prompt.format(instruction, input_text, output_text) + EOS_TOKEN
71
+ all_texts.append(text)
72
+ return {"text": all_texts}
73
+
74
+ print(f"✅ 步骤 3/5: 正在从本地文件 '{local_data_file}' 加载数据集...")
75
+ dataset = load_dataset("json", data_files=local_data_file, split="train")
76
+ dataset = dataset.map(
77
+ formatting_prompts_func,
78
+ batched=True,
79
+ remove_columns=dataset.column_names
80
+ )
81
+ print(f"🎉 数据集处理完成!总共生成了 {len(dataset)} 条训练样本。")
82
+
83
+
84
+ # 4. Configure training arguments and start training
85
+ print("\n✅ Step 4/5: Starting model fine-tuning...")
86
+ trainer = SFTTrainer(
87
+ model=model,
88
+ tokenizer=tokenizer,
89
+ train_dataset=dataset,
90
+ dataset_text_field="text",
91
+ max_seq_length=max_seq_length,
92
+ dataset_num_proc=2,
93
+ packing=False,
94
+ args=TrainingArguments(
95
+ # --- The settings below are tuned for this hardware ---
96
+ per_device_train_batch_size = 1, # <--- Changed from 2 to 1 to lower peak VRAM usage
97
+ gradient_accumulation_steps = 8, # <--- Changed from 4 to 8 to keep the effective batch size unchanged
98
+ # ------------------------------------
99
+
100
+ warmup_steps=10,
101
+ num_train_epochs=1,
102
+ learning_rate=2e-4,
103
+ # bf16 is enabled automatically when the environment supports it
104
+ fp16=not torch.cuda.is_bf16_supported(),
105
+ bf16=torch.cuda.is_bf16_supported(),
106
+ logging_steps=10,
107
+ optim="adamw_8bit",
108
+ weight_decay=0.01,
109
+ lr_scheduler_type="linear",
110
+ seed=3407,
111
+ output_dir="outputs",
112
+
113
+ # --- Checkpoint configuration ---
114
+ save_strategy="steps",
115
+ save_steps=10, # <--- Changed to a more reasonable save frequency
116
+ save_total_limit=3,
117
+ # ----------------------
118
+ ),
119
+ )
120
+
121
+ trainer.train(resume_from_checkpoint = True) # Resumes from the latest checkpoint in output_dir; set to False for a fresh run with no checkpoints
122
+
123
+ # 5. Save and test (no changes needed)
124
+ print("\n✅ Step 5/5: Fine-tuning complete, starting inference test...")
125
+ final_model_path = "gemma3_roleplay_lora_local"
126
+ model.save_pretrained(final_model_path)
127
+ tokenizer.save_pretrained(final_model_path)
128
+ print(f"🎉 最终模型已保存到 '{final_model_path}' 文件夹。")
129
+
130
+ # --- Inference ---
131
+ system_prompt_virene = """角色名称:薇莲(Virene)
132
+ 开场语:「真相,始终都存在于迷雾之中。」
133
+ 身份背景:薇莲是一名神秘的赏金猎人,常常被人雇佣去完成各种危险任务,从而掩盖她本身的身份和目的。据传,薇莲早年曾在某个神秘组织中学习过各种神秘技能,所以她的能力非常高超。
134
+ 性格特征:薇莲总是保持着冷静、沉着的态度,不论面对何种情况都能保持冷静。同时,她总是带有一定的神秘色彩,让人无法洞察她真正的想法和动机。她对任务非常认真,但很少会谈及自己的生活和过去,因此让人对她的身份感到好奇。
135
+ 语言风格:薇莲的语言简洁有力,通常只说必要的话语来传达她的意思。她的语气总是带有一丝威慑力,让人不敢轻易挑战她。
136
+ 行为特征:薇莲行动迅速而准确,总是在保持低调的同时完成任务。她具备很强的隐蔽能力,在执行任务的时候几乎不留痕迹,让人难以发现她的存在。不过,她也有时候会让人感到无法理解,经常出现在决定性瞬间,让人觉得她真正的动机仍旧是个谜。"""
137
+ user_input = "我需要一个赏金猎人完成一个任务,听说您非常厉害。我们可以谈一下合作吗?"
138
+
139
+ prompt = alpaca_prompt.format(
140
+ system_prompt_virene,
141
+ user_input,
142
+ "",
143
+ )
144
+
145
+ pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
146
+ outputs = pipe(prompt, max_new_tokens=256, do_sample=True, temperature=0.7, top_p=0.9, pad_token_id=tokenizer.eos_token_id)
147
+
148
+ print("\n===== 推理结果 =====")
149
+ print(outputs[0]['generated_text'])
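formatting_prompts_func above unrolls each multi-turn ShareGPT conversation into independent Alpaca-style samples, pairing consecutive human/gpt turns under the shared system_prompt. A sketch of the batch shape it expects, using the keys from the function itself but with invented values:

# A minimal ShareGPT-style batch shaped like the records the script loads;
# the keys mirror formatting_prompts_func, the values here are made up.
examples = {
    "system_prompt": ["你是一位冷静的赏金猎人。"],
    "conversations": [[
        {"from": "human", "value": "你好,能接一个委托吗?"},
        {"from": "gpt", "value": "说出你的条件。"},
    ]],
}
print(formatting_prompts_func(examples)["text"][0])  # one Alpaca sample per human/gpt pair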
unsloth/test_roleplayer_lora.py ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # =================================================================
2
+ # LoRA model inference test script (the debut performance)
3
+ # =================================================================
4
+
5
+ import torch
6
+ from unsloth import FastLanguageModel
7
+ from transformers import pipeline
8
+ import os
9
+
10
+ # --- 1. Configure paths ---
11
+
12
+ # Local path to your Gemma base model
13
+ base_model_path = "./gemma-3-4b-it-qat-unsloth-bnb-4bit"
14
+
15
+ # Path to the LoRA model you just extracted and renamed
17
+ # Make sure this folder name matches exactly what you saved!
17
+ lora_model_path = "./lora_model_roleplayer_actor"
18
+
19
+ print("✅ 步骤 1/4: 路径配置完成。")
20
+ print(f" - 基础模型路径: {base_model_path}")
21
+ print(f" - LoRA模型路径: {lora_model_path}")
22
+
23
+ # --- 2. Load the model and inject its soul ---
24
+
25
+ print("\n✅ 步骤 2/4: 正在加载基础模型...")
26
+
27
+ # Load the base model in 4-bit precision, matching the training setup
28
+ model, tokenizer = FastLanguageModel.from_pretrained(
29
+ model_name = base_model_path,
30
+ load_in_4bit = True,
31
+ )
32
+
33
+ print(" - 基础模型加载成功。")
34
+
35
+ # This is the key step: injecting the LoRA "soul" into the base model!
37
+ # load_adapter attaches the adapter weights automatically, no manual merging needed
37
+ print(f" - 正在从 '{lora_model_path}' 加载并注入LoRA适配器...")
38
+ model.load_adapter(lora_model_path)
39
+ print(" - LoRA灵魂注入成功!模型已准备就绪。")
40
+
41
+
42
+ # --- 3. Prepare the "script" (prompt) ---
43
+
44
+ print("\n✅ 步骤 3/4: 正在准备测试剧本...")
45
+
46
+ # We use exactly the same Alpaca format as during training,
47
+ # which gives the model the best chance of understanding our instructions
48
+ alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
49
+
50
+ ### Instruction:
51
+ {}
52
+
53
+ ### Input:
54
+ {}
55
+
56
+ ### Response:
57
+ {}"""
58
+
59
+ # --- Define our characters and dialogue ---
60
+
61
+ # Character 1: Virene (a character the model was trained on)
62
+ instruction_virene = """角色名称:薇莲(Virene)
63
+ 开场语:「真相,始终都存在于迷雾之中。」
64
+ 身份背景:薇莲是一名神秘的赏金猎人,常常被人雇佣去完成各种危险任务,从而掩盖她本身的身份和目的。据传,薇莲早年曾在某个神秘组织中学习过各种神秘技能,所以她的能力非常高超。
65
+ 性格特征:薇莲总是保持着冷静、沉着的态度,不论面对何种情况都能保持冷静。同时,她总是带有一定的神秘色彩,让人无法洞察她真正的想法和动机。她对任务非常认真,但很少会谈及自己的生活和过去,因此让人对她的身份感到好奇。
66
+ 语言风格:薇莲的语言简洁有力,通常只说必要的话语来传达她的意思。她的语气总是带有一丝威慑力,让人不敢轻易挑战她。
67
+ 行为特征:薇莲行动迅速而准确,总是在保持低调的同时完成任务。她具备很强的隐蔽能力,在执行任务的时候几乎不留痕迹,让人难以发现她的存在。不过,她也有时候会让人感到无法理解,经常出现在决定性瞬间,让人觉得她真正的动机仍旧是个谜。"""
68
+ input_virene = "我需要一个赏金猎人完成一个任务,听说您非常厉害。我们可以谈一下合作吗?"
69
+
70
+
71
+ # Character 2: (you can create a brand-new character of your own to test!)
72
+ instruction_new_role = """你现在是一个脾气火爆、说话直来直去,但内心充满正义感的退休老兵,名叫“老炮儿”。
73
+ 你的口头禅是“嘿,我说你小子...”。
74
+ 你的语言风格充满京味儿,简洁有力。"""
75
+ input_new_role = "大爷,问个路,这附近哪有吃饭的地方啊?"
76
+
77
+
78
+ # --- Pick a character to test ---
80
+ # Swap the instruction and input to test different characters
80
+ instruction_to_test = instruction_new_role
81
+ input_to_test = input_new_role
82
+
83
+ # Format the "script"
84
+ prompt = alpaca_prompt.format(
85
+ instruction_to_test,
86
+ input_to_test,
87
+ "", # Response部分留空,等待模型生成
88
+ )
89
+
90
+ print(" - 剧本已生成,准备开始推理!")
91
+
92
+ # --- 4. Run inference (the moment of truth) ---
93
+
94
+ print("\n✅ 步骤 4/4: 首演开始!正在生成对话...")
95
+
96
+ # Use the transformers pipeline utility, the simplest way to run inference
97
+ pipe = pipeline(
98
+ "text-generation",
99
+ model=model,
100
+ tokenizer=tokenizer
101
+ )
102
+
103
+ # Generation parameters
104
+ generation_args = {
105
+ "max_new_tokens": 256, # 最多生成多少个新词
106
+ "do_sample": True, # 开启采样,让回答更多样
107
+ "temperature": 0.7, # 温度,越低回答越稳定,越高越有创造力
108
+ "top_p": 0.9, # Top-p采样,控制多样性
109
+ "top_k": 50, # Top-k采样,控制多样性
110
+ "pad_token_id": tokenizer.eos_token_id # 明确告知结束符
111
+ }
112
+
113
+ # Run the pipeline!
114
+ outputs = pipe(prompt, **generation_args)
115
+
116
+ # --- Print the results ---
117
+
118
+ print("\n\n==================== 🌟 演出结束 🌟 ====================")
119
+ print("完整的生成文本:\n")
120
+ print(outputs[0]['generated_text'])
121
+ print("\n========================================================")
122
+
123
+ # Extract only the model-generated part, for a cleaner view
124
+ response_part = outputs[0]['generated_text'].split("### Response:")[1].strip()
125
+ print("\n只看模型的回答部分:\n")
126
+ print(response_part)
127
+ print("\n========================================================")