wangyichen25 commited on
Commit
e97878f
·
verified ·
1 Parent(s): c893c0c

Training in progress, step 120, checkpoint

Browse files
checkpoint-120/README.md CHANGED
@@ -206,4 +206,5 @@ Carbon emissions can be estimated using the [Machine Learning Impact calculator]
206
  [More Information Needed]
207
  ### Framework versions
208
 
 
209
  - PEFT 0.17.0
 
206
  [More Information Needed]
207
  ### Framework versions
208
 
209
+ - PEFT 0.17.1
210
  - PEFT 0.17.0
checkpoint-120/adapter_config.json CHANGED
@@ -28,16 +28,16 @@
28
  "rank_pattern": {},
29
  "revision": null,
30
  "target_modules": [
31
- "gate_proj",
32
- "down_proj",
33
- "k_proj",
34
- "q_proj",
35
- "up_proj",
36
  "out_proj",
37
- "fc2",
38
  "o_proj",
39
- "fc1",
40
- "v_proj"
 
 
 
 
 
41
  ],
42
  "target_parameters": null,
43
  "task_type": "CAUSAL_LM",
 
28
  "rank_pattern": {},
29
  "revision": null,
30
  "target_modules": [
31
+ "fc1",
 
 
 
 
32
  "out_proj",
 
33
  "o_proj",
34
+ "up_proj",
35
+ "fc2",
36
+ "down_proj",
37
+ "q_proj",
38
+ "v_proj",
39
+ "gate_proj",
40
+ "k_proj"
41
  ],
42
  "target_parameters": null,
43
  "task_type": "CAUSAL_LM",
checkpoint-120/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a63158af57cbbb839baf13c75b5b7e43c917c3b77e730742ef7255b3e7476392
3
  size 6127553104
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4719fe220a3d42f1d99077b29605dce4e46d514fa3df7cb4806703e37700baa6
3
  size 6127553104
checkpoint-120/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d1c42f8f8f0563636de7a5103cc002f89c6b220220d699bd41c5d5dd4bc8d85c
3
  size 12255795061
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0f89f816815b7e2d82741bc75254dd212589d1178c85734f0b6b74446908ee0b
3
  size 12255795061
checkpoint-120/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7b6e2d6d9339ca2e421dac098c5e7d65ad9fb55247e8cfbce3aa56958f70cda6
3
+ size 14645
checkpoint-120/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:293f7c577c9a7e0432f26bd9e01d04d35bfd71833c40176261e1d41887787c1a
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c6b282f4e35f05792bc7cf401176337b080e060e2f8a00bbaa0daa1f582160e6
3
  size 1465
checkpoint-120/trainer_state.json CHANGED
@@ -2,7 +2,7 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.997920997920998,
6
  "eval_steps": 10,
7
  "global_step": 120,
8
  "is_hyper_param_search": false,
@@ -10,236 +10,260 @@
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "epoch": 0.08316008316008316,
14
- "grad_norm": 11.767539024353027,
15
- "learning_rate": 0.00019145299145299148,
16
- "loss": 18.0054,
17
- "mean_token_accuracy": 0.8393091425299645,
18
- "num_tokens": 323168.0,
 
19
  "step": 10
20
  },
21
  {
22
- "epoch": 0.08316008316008316,
23
- "eval_loss": 0.14767414331436157,
24
- "eval_mean_token_accuracy": 0.9865884414085975,
25
- "eval_num_tokens": 323168.0,
26
- "eval_runtime": 32.0686,
27
- "eval_samples_per_second": 6.237,
28
- "eval_steps_per_second": 0.405,
 
29
  "step": 10
30
  },
31
  {
32
- "epoch": 0.16632016632016633,
33
- "grad_norm": 4.4300336837768555,
34
- "learning_rate": 0.00017435897435897436,
35
- "loss": 1.6772,
36
- "mean_token_accuracy": 0.9893433898687363,
37
- "num_tokens": 646431.0,
 
38
  "step": 20
39
  },
40
  {
41
- "epoch": 0.16632016632016633,
42
- "eval_loss": 0.057891350239515305,
43
- "eval_mean_token_accuracy": 0.993429972575261,
44
- "eval_num_tokens": 646431.0,
45
- "eval_runtime": 32.7893,
46
- "eval_samples_per_second": 6.1,
47
- "eval_steps_per_second": 0.396,
 
48
  "step": 20
49
  },
50
  {
51
- "epoch": 0.2494802494802495,
52
- "grad_norm": 2.2051281929016113,
53
- "learning_rate": 0.00015726495726495727,
54
- "loss": 0.406,
55
- "mean_token_accuracy": 0.9940585166215896,
56
- "num_tokens": 969623.0,
 
57
  "step": 30
58
  },
59
  {
60
- "epoch": 0.2494802494802495,
61
- "eval_loss": 0.01031240914016962,
62
- "eval_mean_token_accuracy": 0.9945538227374737,
63
- "eval_num_tokens": 969623.0,
64
- "eval_runtime": 32.2752,
65
- "eval_samples_per_second": 6.197,
66
- "eval_steps_per_second": 0.403,
 
67
  "step": 30
68
  },
69
  {
70
- "epoch": 0.33264033264033266,
71
- "grad_norm": 0.7954460382461548,
72
- "learning_rate": 0.00014017094017094016,
73
- "loss": 0.156,
74
- "mean_token_accuracy": 0.9945382237434387,
75
- "num_tokens": 1292839.0,
 
76
  "step": 40
77
  },
78
  {
79
- "epoch": 0.33264033264033266,
80
- "eval_loss": 0.009185228496789932,
81
- "eval_mean_token_accuracy": 0.9949805828241202,
82
- "eval_num_tokens": 1292839.0,
83
- "eval_runtime": 31.808,
84
- "eval_samples_per_second": 6.288,
85
- "eval_steps_per_second": 0.409,
 
86
  "step": 40
87
  },
88
  {
89
- "epoch": 0.4158004158004158,
90
- "grad_norm": 0.5219862461090088,
91
- "learning_rate": 0.0001230769230769231,
92
- "loss": 0.1342,
93
- "mean_token_accuracy": 0.9951596394181251,
94
- "num_tokens": 1615982.0,
 
95
  "step": 50
96
  },
97
  {
98
- "epoch": 0.4158004158004158,
99
- "eval_loss": 0.008571554906666279,
100
- "eval_mean_token_accuracy": 0.9952132931122413,
101
- "eval_num_tokens": 1615982.0,
102
- "eval_runtime": 32.1863,
103
- "eval_samples_per_second": 6.214,
104
- "eval_steps_per_second": 0.404,
 
105
  "step": 50
106
  },
107
  {
108
- "epoch": 0.498960498960499,
109
- "grad_norm": 0.8362743258476257,
110
- "learning_rate": 0.000105982905982906,
111
- "loss": 0.1291,
112
- "mean_token_accuracy": 0.9950484573841095,
113
- "num_tokens": 1939204.0,
 
114
  "step": 60
115
  },
116
  {
117
- "epoch": 0.498960498960499,
118
- "eval_loss": 0.008207487873733044,
119
- "eval_mean_token_accuracy": 0.99532917371163,
120
- "eval_num_tokens": 1939204.0,
121
- "eval_runtime": 32.3309,
122
- "eval_samples_per_second": 6.186,
123
- "eval_steps_per_second": 0.402,
 
124
  "step": 60
125
  },
126
  {
127
- "epoch": 0.5821205821205822,
128
- "grad_norm": 0.8683068752288818,
129
- "learning_rate": 8.888888888888889e-05,
130
- "loss": 0.1313,
131
- "mean_token_accuracy": 0.9951223149895668,
132
- "num_tokens": 2262372.0,
 
133
  "step": 70
134
  },
135
  {
136
- "epoch": 0.5821205821205822,
137
- "eval_loss": 0.008194765076041222,
138
- "eval_mean_token_accuracy": 0.9951937015240009,
139
- "eval_num_tokens": 2262372.0,
140
- "eval_runtime": 32.1376,
141
- "eval_samples_per_second": 6.223,
142
- "eval_steps_per_second": 0.405,
 
143
  "step": 70
144
  },
145
  {
146
- "epoch": 0.6652806652806653,
147
- "grad_norm": 0.401977002620697,
148
- "learning_rate": 7.17948717948718e-05,
149
- "loss": 0.1151,
150
- "mean_token_accuracy": 0.995387016236782,
151
- "num_tokens": 2585532.0,
 
152
  "step": 80
153
  },
154
  {
155
- "epoch": 0.6652806652806653,
156
- "eval_loss": 0.007898409850895405,
157
- "eval_mean_token_accuracy": 0.9954066093151386,
158
- "eval_num_tokens": 2585532.0,
159
- "eval_runtime": 32.2572,
160
- "eval_samples_per_second": 6.2,
161
- "eval_steps_per_second": 0.403,
 
162
  "step": 80
163
  },
164
  {
165
- "epoch": 0.7484407484407485,
166
- "grad_norm": 0.5641180276870728,
167
- "learning_rate": 5.470085470085471e-05,
168
- "loss": 0.1226,
169
- "mean_token_accuracy": 0.9952676251530648,
170
- "num_tokens": 2908697.0,
 
171
  "step": 90
172
  },
173
  {
174
- "epoch": 0.7484407484407485,
175
- "eval_loss": 0.007712052669376135,
176
- "eval_mean_token_accuracy": 0.9953293800354004,
177
- "eval_num_tokens": 2908697.0,
178
- "eval_runtime": 32.1784,
179
- "eval_samples_per_second": 6.215,
180
- "eval_steps_per_second": 0.404,
 
181
  "step": 90
182
  },
183
  {
184
- "epoch": 0.8316008316008316,
185
- "grad_norm": 0.4296090602874756,
186
- "learning_rate": 3.760683760683761e-05,
187
- "loss": 0.1181,
188
- "mean_token_accuracy": 0.9952058300375939,
189
- "num_tokens": 3231892.0,
 
190
  "step": 100
191
  },
192
  {
193
- "epoch": 0.8316008316008316,
194
- "eval_loss": 0.0073149907402694225,
195
- "eval_mean_token_accuracy": 0.99550382907574,
196
- "eval_num_tokens": 3231892.0,
197
- "eval_runtime": 32.1177,
198
- "eval_samples_per_second": 6.227,
199
- "eval_steps_per_second": 0.405,
 
200
  "step": 100
201
  },
202
  {
203
- "epoch": 0.9147609147609148,
204
- "grad_norm": 0.3773857057094574,
205
- "learning_rate": 2.0512820512820512e-05,
206
- "loss": 0.1155,
207
- "mean_token_accuracy": 0.99525695592165,
208
- "num_tokens": 3555132.0,
 
209
  "step": 110
210
  },
211
  {
212
- "epoch": 0.9147609147609148,
213
- "eval_loss": 0.007079997565597296,
214
- "eval_mean_token_accuracy": 0.9955812417543851,
215
- "eval_num_tokens": 3555132.0,
216
- "eval_runtime": 30.7586,
217
- "eval_samples_per_second": 6.502,
218
- "eval_steps_per_second": 0.423,
 
219
  "step": 110
220
  },
221
  {
222
- "epoch": 0.997920997920998,
223
- "grad_norm": 0.2652049958705902,
224
- "learning_rate": 3.4188034188034193e-06,
225
- "loss": 0.1145,
226
- "mean_token_accuracy": 0.9952491670846939,
227
- "num_tokens": 3878323.0,
 
228
  "step": 120
229
  },
230
  {
231
- "epoch": 0.997920997920998,
232
- "eval_loss": 0.006998998112976551,
233
- "eval_mean_token_accuracy": 0.9955618839997512,
234
- "eval_num_tokens": 3878323.0,
235
- "eval_runtime": 30.145,
236
- "eval_samples_per_second": 6.635,
237
- "eval_steps_per_second": 0.431,
 
238
  "step": 120
239
  }
240
  ],
241
  "logging_steps": 10,
242
- "max_steps": 121,
243
  "num_input_tokens_seen": 0,
244
  "num_train_epochs": 1,
245
  "save_steps": 40,
@@ -255,7 +279,7 @@
255
  "attributes": {}
256
  }
257
  },
258
- "total_flos": 6.438173500170568e+17,
259
  "train_batch_size": 4,
260
  "trial_name": null,
261
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.08301625735039779,
6
  "eval_steps": 10,
7
  "global_step": 120,
8
  "is_hyper_param_search": false,
 
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "entropy": 3.2428319215774537,
14
+ "epoch": 0.0069180214458664825,
15
+ "grad_norm": 45.59263229370117,
16
+ "learning_rate": 4.0909090909090915e-05,
17
+ "loss": 8.7229,
18
+ "mean_token_accuracy": 0.20135476849973202,
19
+ "num_tokens": 44798.0,
20
  "step": 10
21
  },
22
  {
23
+ "epoch": 0.0069180214458664825,
24
+ "eval_entropy": 3.4517827892303465,
25
+ "eval_loss": 7.022937774658203,
26
+ "eval_mean_token_accuracy": 0.29038591831922533,
27
+ "eval_num_tokens": 44798.0,
28
+ "eval_runtime": 42.5158,
29
+ "eval_samples_per_second": 4.704,
30
+ "eval_steps_per_second": 1.176,
31
  "step": 10
32
  },
33
  {
34
+ "entropy": 4.103338432312012,
35
+ "epoch": 0.013836042891732965,
36
+ "grad_norm": 15.53995418548584,
37
+ "learning_rate": 8.636363636363637e-05,
38
+ "loss": 5.0491,
39
+ "mean_token_accuracy": 0.44307171255350114,
40
+ "num_tokens": 89551.0,
41
  "step": 20
42
  },
43
  {
44
+ "epoch": 0.013836042891732965,
45
+ "eval_entropy": 4.955911350250244,
46
+ "eval_loss": 2.6293535232543945,
47
+ "eval_mean_token_accuracy": 0.6638811433315277,
48
+ "eval_num_tokens": 89551.0,
49
+ "eval_runtime": 42.5341,
50
+ "eval_samples_per_second": 4.702,
51
+ "eval_steps_per_second": 1.176,
52
  "step": 20
53
  },
54
  {
55
+ "entropy": 4.794859045743943,
56
+ "epoch": 0.020754064337599448,
57
+ "grad_norm": 8.58745002746582,
58
+ "learning_rate": 0.0001318181818181818,
59
+ "loss": 1.7872,
60
+ "mean_token_accuracy": 0.7792469739913941,
61
+ "num_tokens": 134427.0,
62
  "step": 30
63
  },
64
  {
65
+ "epoch": 0.020754064337599448,
66
+ "eval_entropy": 4.026243486404419,
67
+ "eval_loss": 1.0728627443313599,
68
+ "eval_mean_token_accuracy": 0.8464114594459534,
69
+ "eval_num_tokens": 134427.0,
70
+ "eval_runtime": 42.5437,
71
+ "eval_samples_per_second": 4.701,
72
+ "eval_steps_per_second": 1.175,
73
  "step": 30
74
  },
75
  {
76
+ "entropy": 2.9154508650302886,
77
+ "epoch": 0.02767208578346593,
78
+ "grad_norm": 5.161023139953613,
79
+ "learning_rate": 0.00017727272727272728,
80
+ "loss": 0.7894,
81
+ "mean_token_accuracy": 0.881743885576725,
82
+ "num_tokens": 179334.0,
83
  "step": 40
84
  },
85
  {
86
+ "epoch": 0.02767208578346593,
87
+ "eval_entropy": 1.3028265857696533,
88
+ "eval_loss": 0.27193209528923035,
89
+ "eval_mean_token_accuracy": 0.8934199070930481,
90
+ "eval_num_tokens": 179334.0,
91
+ "eval_runtime": 42.5195,
92
+ "eval_samples_per_second": 4.704,
93
+ "eval_steps_per_second": 1.176,
94
  "step": 40
95
  },
96
  {
97
+ "entropy": 0.9636951878666877,
98
+ "epoch": 0.03459010722933241,
99
+ "grad_norm": 5.691296100616455,
100
+ "learning_rate": 0.0001992867332382311,
101
+ "loss": 0.2666,
102
+ "mean_token_accuracy": 0.8942964091897011,
103
+ "num_tokens": 224191.0,
104
  "step": 50
105
  },
106
  {
107
+ "epoch": 0.03459010722933241,
108
+ "eval_entropy": 0.7585571753978729,
109
+ "eval_loss": 0.2833440601825714,
110
+ "eval_mean_token_accuracy": 0.9009752857685089,
111
+ "eval_num_tokens": 224191.0,
112
+ "eval_runtime": 42.6014,
113
+ "eval_samples_per_second": 4.695,
114
+ "eval_steps_per_second": 1.174,
115
  "step": 50
116
  },
117
  {
118
+ "entropy": 0.7302425026893615,
119
+ "epoch": 0.041508128675198895,
120
+ "grad_norm": 2.4715988636016846,
121
+ "learning_rate": 0.0001978601997146933,
122
+ "loss": 0.273,
123
+ "mean_token_accuracy": 0.888365663588047,
124
+ "num_tokens": 269084.0,
125
  "step": 60
126
  },
127
  {
128
+ "epoch": 0.041508128675198895,
129
+ "eval_entropy": 0.8055200433731079,
130
+ "eval_loss": 0.21239124238491058,
131
+ "eval_mean_token_accuracy": 0.9023281943798065,
132
+ "eval_num_tokens": 269084.0,
133
+ "eval_runtime": 42.5395,
134
+ "eval_samples_per_second": 4.702,
135
+ "eval_steps_per_second": 1.175,
136
  "step": 60
137
  },
138
  {
139
+ "entropy": 0.7857675984501838,
140
+ "epoch": 0.048426150121065374,
141
+ "grad_norm": 1.5094635486602783,
142
+ "learning_rate": 0.0001964336661911555,
143
+ "loss": 0.1984,
144
+ "mean_token_accuracy": 0.9075267255306244,
145
+ "num_tokens": 313857.0,
146
  "step": 70
147
  },
148
  {
149
+ "epoch": 0.048426150121065374,
150
+ "eval_entropy": 0.7421065926551819,
151
+ "eval_loss": 0.17795822024345398,
152
+ "eval_mean_token_accuracy": 0.9092637586593628,
153
+ "eval_num_tokens": 313857.0,
154
+ "eval_runtime": 42.718,
155
+ "eval_samples_per_second": 4.682,
156
+ "eval_steps_per_second": 1.17,
157
  "step": 70
158
  },
159
  {
160
+ "entropy": 0.7267766013741493,
161
+ "epoch": 0.05534417156693186,
162
+ "grad_norm": 1.624765157699585,
163
+ "learning_rate": 0.0001950071326676177,
164
+ "loss": 0.1844,
165
+ "mean_token_accuracy": 0.9070346429944038,
166
+ "num_tokens": 358622.0,
167
  "step": 80
168
  },
169
  {
170
+ "epoch": 0.05534417156693186,
171
+ "eval_entropy": 0.8165990447998047,
172
+ "eval_loss": 0.18048767745494843,
173
+ "eval_mean_token_accuracy": 0.9136576187610627,
174
+ "eval_num_tokens": 358622.0,
175
+ "eval_runtime": 42.5831,
176
+ "eval_samples_per_second": 4.697,
177
+ "eval_steps_per_second": 1.174,
178
  "step": 80
179
  },
180
  {
181
+ "entropy": 1.002037839591503,
182
+ "epoch": 0.06226219301279834,
183
+ "grad_norm": 1.328676462173462,
184
+ "learning_rate": 0.0001935805991440799,
185
+ "loss": 0.1736,
186
+ "mean_token_accuracy": 0.9099974319338798,
187
+ "num_tokens": 403518.0,
188
  "step": 90
189
  },
190
  {
191
+ "epoch": 0.06226219301279834,
192
+ "eval_entropy": 1.1328275966644288,
193
+ "eval_loss": 0.15943188965320587,
194
+ "eval_mean_token_accuracy": 0.9157200062274933,
195
+ "eval_num_tokens": 403518.0,
196
+ "eval_runtime": 42.7297,
197
+ "eval_samples_per_second": 4.681,
198
+ "eval_steps_per_second": 1.17,
199
  "step": 90
200
  },
201
  {
202
+ "entropy": 1.098360726237297,
203
+ "epoch": 0.06918021445866482,
204
+ "grad_norm": 1.5309491157531738,
205
+ "learning_rate": 0.0001921540656205421,
206
+ "loss": 0.1725,
207
+ "mean_token_accuracy": 0.9140415117144585,
208
+ "num_tokens": 448405.0,
209
  "step": 100
210
  },
211
  {
212
+ "epoch": 0.06918021445866482,
213
+ "eval_entropy": 1.028436896800995,
214
+ "eval_loss": 0.16933326423168182,
215
+ "eval_mean_token_accuracy": 0.915260488986969,
216
+ "eval_num_tokens": 448405.0,
217
+ "eval_runtime": 42.7333,
218
+ "eval_samples_per_second": 4.68,
219
+ "eval_steps_per_second": 1.17,
220
  "step": 100
221
  },
222
  {
223
+ "entropy": 1.0331062585115434,
224
+ "epoch": 0.0760982359045313,
225
+ "grad_norm": 1.9411410093307495,
226
+ "learning_rate": 0.00019072753209700428,
227
+ "loss": 0.1761,
228
+ "mean_token_accuracy": 0.9094761416316033,
229
+ "num_tokens": 493303.0,
230
  "step": 110
231
  },
232
  {
233
+ "epoch": 0.0760982359045313,
234
+ "eval_entropy": 1.0022910988330842,
235
+ "eval_loss": 0.16668693721294403,
236
+ "eval_mean_token_accuracy": 0.9155051994323731,
237
+ "eval_num_tokens": 493303.0,
238
+ "eval_runtime": 42.6346,
239
+ "eval_samples_per_second": 4.691,
240
+ "eval_steps_per_second": 1.173,
241
  "step": 110
242
  },
243
  {
244
+ "entropy": 1.0691627204418181,
245
+ "epoch": 0.08301625735039779,
246
+ "grad_norm": 1.2629178762435913,
247
+ "learning_rate": 0.00018930099857346648,
248
+ "loss": 0.1788,
249
+ "mean_token_accuracy": 0.9090283781290054,
250
+ "num_tokens": 538145.0,
251
  "step": 120
252
  },
253
  {
254
+ "epoch": 0.08301625735039779,
255
+ "eval_entropy": 1.4147576117515563,
256
+ "eval_loss": 0.15232698619365692,
257
+ "eval_mean_token_accuracy": 0.9161913430690766,
258
+ "eval_num_tokens": 538145.0,
259
+ "eval_runtime": 42.5837,
260
+ "eval_samples_per_second": 4.697,
261
+ "eval_steps_per_second": 1.174,
262
  "step": 120
263
  }
264
  ],
265
  "logging_steps": 10,
266
+ "max_steps": 1446,
267
  "num_input_tokens_seen": 0,
268
  "num_train_epochs": 1,
269
  "save_steps": 40,
 
279
  "attributes": {}
280
  }
281
  },
282
+ "total_flos": 9.133981968547162e+16,
283
  "train_batch_size": 4,
284
  "trial_name": null,
285
  "trial_params": null
checkpoint-120/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fd81184c4386bdd5320f1754d4cda79540e3bb45d4e9eeffadfdb4c17e09fef2
3
- size 6353
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c665ff9710ba066622bdc47a0845adeeeb156957d33148906e62f67561245a3f
3
+ size 6481