wangyichen25 commited on
Commit
ce3bf67
·
verified ·
1 Parent(s): 63619b7

Training in progress, step 80, checkpoint

Browse files
checkpoint-80/README.md CHANGED
@@ -206,4 +206,5 @@ Carbon emissions can be estimated using the [Machine Learning Impact calculator]
206
  [More Information Needed]
207
  ### Framework versions
208
 
 
209
  - PEFT 0.17.0
 
206
  [More Information Needed]
207
  ### Framework versions
208
 
209
+ - PEFT 0.17.1
210
  - PEFT 0.17.0
checkpoint-80/adapter_config.json CHANGED
@@ -28,16 +28,16 @@
28
  "rank_pattern": {},
29
  "revision": null,
30
  "target_modules": [
31
- "gate_proj",
32
- "down_proj",
33
- "k_proj",
34
- "q_proj",
35
- "up_proj",
36
  "out_proj",
37
- "fc2",
38
  "o_proj",
39
- "fc1",
40
- "v_proj"
 
 
 
 
 
41
  ],
42
  "target_parameters": null,
43
  "task_type": "CAUSAL_LM",
 
28
  "rank_pattern": {},
29
  "revision": null,
30
  "target_modules": [
31
+ "fc1",
 
 
 
 
32
  "out_proj",
 
33
  "o_proj",
34
+ "up_proj",
35
+ "fc2",
36
+ "down_proj",
37
+ "q_proj",
38
+ "v_proj",
39
+ "gate_proj",
40
+ "k_proj"
41
  ],
42
  "target_parameters": null,
43
  "task_type": "CAUSAL_LM",
checkpoint-80/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b38fdc58f221299e173f4d29117d7c3557d9f9ccc7e564535135bd219c57e26c
3
  size 6127553104
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fbc16fc88574d23b2a8ef23b626ea5b8d2ba9b068ae0fb038e7729f73e9bb63d
3
  size 6127553104
checkpoint-80/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e004d33a7ef5e1384aee2fd88d54dbc7274bd2f5f3ab31eeb1872d7eb5dd6b96
3
  size 12255795061
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:47147ddfd768955a2389ba2c3c456cd779c8b4dc1df7a442302a77a60038a969
3
  size 12255795061
checkpoint-80/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:02c27ca411a3d5c2d78fff32569312c9b77730b1446b5c1a51897f69007f95c3
3
+ size 14645
checkpoint-80/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:221067d3c8e0a8127ca80a998e2319c9553178d4670269f160050a27de5940f7
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e100b2eac31885bc32fe68c39794d75ce11196153bb3d071fa08e1bad94147db
3
  size 1465
checkpoint-80/trainer_state.json CHANGED
@@ -2,7 +2,7 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.6652806652806653,
6
  "eval_steps": 10,
7
  "global_step": 80,
8
  "is_hyper_param_search": false,
@@ -10,160 +10,176 @@
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "epoch": 0.08316008316008316,
14
- "grad_norm": 11.767539024353027,
15
- "learning_rate": 0.00019145299145299148,
16
- "loss": 18.0054,
17
- "mean_token_accuracy": 0.8393091425299645,
18
- "num_tokens": 323168.0,
 
19
  "step": 10
20
  },
21
  {
22
- "epoch": 0.08316008316008316,
23
- "eval_loss": 0.14767414331436157,
24
- "eval_mean_token_accuracy": 0.9865884414085975,
25
- "eval_num_tokens": 323168.0,
26
- "eval_runtime": 32.0686,
27
- "eval_samples_per_second": 6.237,
28
- "eval_steps_per_second": 0.405,
 
29
  "step": 10
30
  },
31
  {
32
- "epoch": 0.16632016632016633,
33
- "grad_norm": 4.4300336837768555,
34
- "learning_rate": 0.00017435897435897436,
35
- "loss": 1.6772,
36
- "mean_token_accuracy": 0.9893433898687363,
37
- "num_tokens": 646431.0,
 
38
  "step": 20
39
  },
40
  {
41
- "epoch": 0.16632016632016633,
42
- "eval_loss": 0.057891350239515305,
43
- "eval_mean_token_accuracy": 0.993429972575261,
44
- "eval_num_tokens": 646431.0,
45
- "eval_runtime": 32.7893,
46
- "eval_samples_per_second": 6.1,
47
- "eval_steps_per_second": 0.396,
 
48
  "step": 20
49
  },
50
  {
51
- "epoch": 0.2494802494802495,
52
- "grad_norm": 2.2051281929016113,
53
- "learning_rate": 0.00015726495726495727,
54
- "loss": 0.406,
55
- "mean_token_accuracy": 0.9940585166215896,
56
- "num_tokens": 969623.0,
 
57
  "step": 30
58
  },
59
  {
60
- "epoch": 0.2494802494802495,
61
- "eval_loss": 0.01031240914016962,
62
- "eval_mean_token_accuracy": 0.9945538227374737,
63
- "eval_num_tokens": 969623.0,
64
- "eval_runtime": 32.2752,
65
- "eval_samples_per_second": 6.197,
66
- "eval_steps_per_second": 0.403,
 
67
  "step": 30
68
  },
69
  {
70
- "epoch": 0.33264033264033266,
71
- "grad_norm": 0.7954460382461548,
72
- "learning_rate": 0.00014017094017094016,
73
- "loss": 0.156,
74
- "mean_token_accuracy": 0.9945382237434387,
75
- "num_tokens": 1292839.0,
 
76
  "step": 40
77
  },
78
  {
79
- "epoch": 0.33264033264033266,
80
- "eval_loss": 0.009185228496789932,
81
- "eval_mean_token_accuracy": 0.9949805828241202,
82
- "eval_num_tokens": 1292839.0,
83
- "eval_runtime": 31.808,
84
- "eval_samples_per_second": 6.288,
85
- "eval_steps_per_second": 0.409,
 
86
  "step": 40
87
  },
88
  {
89
- "epoch": 0.4158004158004158,
90
- "grad_norm": 0.5219862461090088,
91
- "learning_rate": 0.0001230769230769231,
92
- "loss": 0.1342,
93
- "mean_token_accuracy": 0.9951596394181251,
94
- "num_tokens": 1615982.0,
 
95
  "step": 50
96
  },
97
  {
98
- "epoch": 0.4158004158004158,
99
- "eval_loss": 0.008571554906666279,
100
- "eval_mean_token_accuracy": 0.9952132931122413,
101
- "eval_num_tokens": 1615982.0,
102
- "eval_runtime": 32.1863,
103
- "eval_samples_per_second": 6.214,
104
- "eval_steps_per_second": 0.404,
 
105
  "step": 50
106
  },
107
  {
108
- "epoch": 0.498960498960499,
109
- "grad_norm": 0.8362743258476257,
110
- "learning_rate": 0.000105982905982906,
111
- "loss": 0.1291,
112
- "mean_token_accuracy": 0.9950484573841095,
113
- "num_tokens": 1939204.0,
 
114
  "step": 60
115
  },
116
  {
117
- "epoch": 0.498960498960499,
118
- "eval_loss": 0.008207487873733044,
119
- "eval_mean_token_accuracy": 0.99532917371163,
120
- "eval_num_tokens": 1939204.0,
121
- "eval_runtime": 32.3309,
122
- "eval_samples_per_second": 6.186,
123
- "eval_steps_per_second": 0.402,
 
124
  "step": 60
125
  },
126
  {
127
- "epoch": 0.5821205821205822,
128
- "grad_norm": 0.8683068752288818,
129
- "learning_rate": 8.888888888888889e-05,
130
- "loss": 0.1313,
131
- "mean_token_accuracy": 0.9951223149895668,
132
- "num_tokens": 2262372.0,
 
133
  "step": 70
134
  },
135
  {
136
- "epoch": 0.5821205821205822,
137
- "eval_loss": 0.008194765076041222,
138
- "eval_mean_token_accuracy": 0.9951937015240009,
139
- "eval_num_tokens": 2262372.0,
140
- "eval_runtime": 32.1376,
141
- "eval_samples_per_second": 6.223,
142
- "eval_steps_per_second": 0.405,
 
143
  "step": 70
144
  },
145
  {
146
- "epoch": 0.6652806652806653,
147
- "grad_norm": 0.401977002620697,
148
- "learning_rate": 7.17948717948718e-05,
149
- "loss": 0.1151,
150
- "mean_token_accuracy": 0.995387016236782,
151
- "num_tokens": 2585532.0,
 
152
  "step": 80
153
  },
154
  {
155
- "epoch": 0.6652806652806653,
156
- "eval_loss": 0.007898409850895405,
157
- "eval_mean_token_accuracy": 0.9954066093151386,
158
- "eval_num_tokens": 2585532.0,
159
- "eval_runtime": 32.2572,
160
- "eval_samples_per_second": 6.2,
161
- "eval_steps_per_second": 0.403,
 
162
  "step": 80
163
  }
164
  ],
165
  "logging_steps": 10,
166
- "max_steps": 121,
167
  "num_input_tokens_seen": 0,
168
  "num_train_epochs": 1,
169
  "save_steps": 40,
@@ -179,7 +195,7 @@
179
  "attributes": {}
180
  }
181
  },
182
- "total_flos": 4.292117885846815e+17,
183
  "train_batch_size": 4,
184
  "trial_name": null,
185
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.05534417156693186,
6
  "eval_steps": 10,
7
  "global_step": 80,
8
  "is_hyper_param_search": false,
 
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "entropy": 3.2428319215774537,
14
+ "epoch": 0.0069180214458664825,
15
+ "grad_norm": 45.59263229370117,
16
+ "learning_rate": 4.0909090909090915e-05,
17
+ "loss": 8.7229,
18
+ "mean_token_accuracy": 0.20135476849973202,
19
+ "num_tokens": 44798.0,
20
  "step": 10
21
  },
22
  {
23
+ "epoch": 0.0069180214458664825,
24
+ "eval_entropy": 3.4517827892303465,
25
+ "eval_loss": 7.022937774658203,
26
+ "eval_mean_token_accuracy": 0.29038591831922533,
27
+ "eval_num_tokens": 44798.0,
28
+ "eval_runtime": 42.5158,
29
+ "eval_samples_per_second": 4.704,
30
+ "eval_steps_per_second": 1.176,
31
  "step": 10
32
  },
33
  {
34
+ "entropy": 4.103338432312012,
35
+ "epoch": 0.013836042891732965,
36
+ "grad_norm": 15.53995418548584,
37
+ "learning_rate": 8.636363636363637e-05,
38
+ "loss": 5.0491,
39
+ "mean_token_accuracy": 0.44307171255350114,
40
+ "num_tokens": 89551.0,
41
  "step": 20
42
  },
43
  {
44
+ "epoch": 0.013836042891732965,
45
+ "eval_entropy": 4.955911350250244,
46
+ "eval_loss": 2.6293535232543945,
47
+ "eval_mean_token_accuracy": 0.6638811433315277,
48
+ "eval_num_tokens": 89551.0,
49
+ "eval_runtime": 42.5341,
50
+ "eval_samples_per_second": 4.702,
51
+ "eval_steps_per_second": 1.176,
52
  "step": 20
53
  },
54
  {
55
+ "entropy": 4.794859045743943,
56
+ "epoch": 0.020754064337599448,
57
+ "grad_norm": 8.58745002746582,
58
+ "learning_rate": 0.0001318181818181818,
59
+ "loss": 1.7872,
60
+ "mean_token_accuracy": 0.7792469739913941,
61
+ "num_tokens": 134427.0,
62
  "step": 30
63
  },
64
  {
65
+ "epoch": 0.020754064337599448,
66
+ "eval_entropy": 4.026243486404419,
67
+ "eval_loss": 1.0728627443313599,
68
+ "eval_mean_token_accuracy": 0.8464114594459534,
69
+ "eval_num_tokens": 134427.0,
70
+ "eval_runtime": 42.5437,
71
+ "eval_samples_per_second": 4.701,
72
+ "eval_steps_per_second": 1.175,
73
  "step": 30
74
  },
75
  {
76
+ "entropy": 2.9154508650302886,
77
+ "epoch": 0.02767208578346593,
78
+ "grad_norm": 5.161023139953613,
79
+ "learning_rate": 0.00017727272727272728,
80
+ "loss": 0.7894,
81
+ "mean_token_accuracy": 0.881743885576725,
82
+ "num_tokens": 179334.0,
83
  "step": 40
84
  },
85
  {
86
+ "epoch": 0.02767208578346593,
87
+ "eval_entropy": 1.3028265857696533,
88
+ "eval_loss": 0.27193209528923035,
89
+ "eval_mean_token_accuracy": 0.8934199070930481,
90
+ "eval_num_tokens": 179334.0,
91
+ "eval_runtime": 42.5195,
92
+ "eval_samples_per_second": 4.704,
93
+ "eval_steps_per_second": 1.176,
94
  "step": 40
95
  },
96
  {
97
+ "entropy": 0.9636951878666877,
98
+ "epoch": 0.03459010722933241,
99
+ "grad_norm": 5.691296100616455,
100
+ "learning_rate": 0.0001992867332382311,
101
+ "loss": 0.2666,
102
+ "mean_token_accuracy": 0.8942964091897011,
103
+ "num_tokens": 224191.0,
104
  "step": 50
105
  },
106
  {
107
+ "epoch": 0.03459010722933241,
108
+ "eval_entropy": 0.7585571753978729,
109
+ "eval_loss": 0.2833440601825714,
110
+ "eval_mean_token_accuracy": 0.9009752857685089,
111
+ "eval_num_tokens": 224191.0,
112
+ "eval_runtime": 42.6014,
113
+ "eval_samples_per_second": 4.695,
114
+ "eval_steps_per_second": 1.174,
115
  "step": 50
116
  },
117
  {
118
+ "entropy": 0.7302425026893615,
119
+ "epoch": 0.041508128675198895,
120
+ "grad_norm": 2.4715988636016846,
121
+ "learning_rate": 0.0001978601997146933,
122
+ "loss": 0.273,
123
+ "mean_token_accuracy": 0.888365663588047,
124
+ "num_tokens": 269084.0,
125
  "step": 60
126
  },
127
  {
128
+ "epoch": 0.041508128675198895,
129
+ "eval_entropy": 0.8055200433731079,
130
+ "eval_loss": 0.21239124238491058,
131
+ "eval_mean_token_accuracy": 0.9023281943798065,
132
+ "eval_num_tokens": 269084.0,
133
+ "eval_runtime": 42.5395,
134
+ "eval_samples_per_second": 4.702,
135
+ "eval_steps_per_second": 1.175,
136
  "step": 60
137
  },
138
  {
139
+ "entropy": 0.7857675984501838,
140
+ "epoch": 0.048426150121065374,
141
+ "grad_norm": 1.5094635486602783,
142
+ "learning_rate": 0.0001964336661911555,
143
+ "loss": 0.1984,
144
+ "mean_token_accuracy": 0.9075267255306244,
145
+ "num_tokens": 313857.0,
146
  "step": 70
147
  },
148
  {
149
+ "epoch": 0.048426150121065374,
150
+ "eval_entropy": 0.7421065926551819,
151
+ "eval_loss": 0.17795822024345398,
152
+ "eval_mean_token_accuracy": 0.9092637586593628,
153
+ "eval_num_tokens": 313857.0,
154
+ "eval_runtime": 42.718,
155
+ "eval_samples_per_second": 4.682,
156
+ "eval_steps_per_second": 1.17,
157
  "step": 70
158
  },
159
  {
160
+ "entropy": 0.7267766013741493,
161
+ "epoch": 0.05534417156693186,
162
+ "grad_norm": 1.624765157699585,
163
+ "learning_rate": 0.0001950071326676177,
164
+ "loss": 0.1844,
165
+ "mean_token_accuracy": 0.9070346429944038,
166
+ "num_tokens": 358622.0,
167
  "step": 80
168
  },
169
  {
170
+ "epoch": 0.05534417156693186,
171
+ "eval_entropy": 0.8165990447998047,
172
+ "eval_loss": 0.18048767745494843,
173
+ "eval_mean_token_accuracy": 0.9136576187610627,
174
+ "eval_num_tokens": 358622.0,
175
+ "eval_runtime": 42.5831,
176
+ "eval_samples_per_second": 4.697,
177
+ "eval_steps_per_second": 1.174,
178
  "step": 80
179
  }
180
  ],
181
  "logging_steps": 10,
182
+ "max_steps": 1446,
183
  "num_input_tokens_seen": 0,
184
  "num_train_epochs": 1,
185
  "save_steps": 40,
 
195
  "attributes": {}
196
  }
197
  },
198
+ "total_flos": 6.082356697673088e+16,
199
  "train_batch_size": 4,
200
  "trial_name": null,
201
  "trial_params": null
checkpoint-80/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fd81184c4386bdd5320f1754d4cda79540e3bb45d4e9eeffadfdb4c17e09fef2
3
- size 6353
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c665ff9710ba066622bdc47a0845adeeeb156957d33148906e62f67561245a3f
3
+ size 6481