AbstractPhil committed on
Commit
1470196
·
verified ·
1 Parent(s): d4cbab1

Crystal-Beeper-Harmony-v5 export @ 2025-08-19 01:35:41

Browse files
beeper_final.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f12e8ec36985a6ee2cd6fbf73ce5a4f4201c2df6c9ee9e7b52a5a8df1672ac0c
3
+ size 105808994
beeper_final.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:39e90cb38df57c2443a85076b2d775364793cfc92d288d695d3642be9071796c
3
+ size 105789640
codec.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "vocab_size": 260,
3
+ "bos": 256,
4
+ "eos": 257,
5
+ "pad": 258,
6
+ "unk": 259,
7
+ "punct_chars": [
8
+ ".",
9
+ ",",
10
+ ";",
11
+ ":",
12
+ "!",
13
+ "?",
14
+ "'",
15
+ "\"",
16
+ "(",
17
+ ")",
18
+ "[",
19
+ "]",
20
+ "{",
21
+ "}",
22
+ "-",
23
+ "\u2014",
24
+ "\u2026"
25
+ ]
26
+ }
model_config.json ADDED
@@ -0,0 +1,161 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "Crystal-Beeper-Harmony-v5",
3
+ "context": 512,
4
+ "dim": 512,
5
+ "n_layers": 6,
6
+ "n_heads": 8,
7
+ "mlp_ratio": 4.0,
8
+ "dropout": 0.0,
9
+ "resid_dropout": 0.1,
10
+ "grad_checkpoint": false,
11
+ "compile_model": false,
12
+ "use_ascii": true,
13
+ "vocab_size": 8192,
14
+ "regions_per_block": 64,
15
+ "capoera": {
16
+ "enable": true,
17
+ "topic_bins": 512,
18
+ "mood_bins": 7
19
+ },
20
+ "context_mask_style": "right",
21
+ "_alive_entries": [
22
+ {
23
+ "name": "TinyStories",
24
+ "path": "roneneldan/TinyStories",
25
+ "split": "train[30%:50%]",
26
+ "weight": 0.1,
27
+ "dialect": [
28
+ 0.6000000238418579,
29
+ 0.10000000149011612,
30
+ 0.05000000074505806,
31
+ 0.05000000074505806,
32
+ 0.20000000298023224
33
+ ],
34
+ "class_id": 0,
35
+ "p": 0.03125000000000001
36
+ },
37
+ {
38
+ "name": "WikipediaEN",
39
+ "path": "wikimedia/wikipedia",
40
+ "config": "20231101.en",
41
+ "split": "train[5%:15%]",
42
+ "weight": 0.5,
43
+ "dialect": [
44
+ 0.11999999731779099,
45
+ 0.5799999833106995,
46
+ 0.10000000149011612,
47
+ 0.10000000149011612,
48
+ 0.10000000149011612
49
+ ],
50
+ "class_id": 1,
51
+ "p": 0.15625
52
+ },
53
+ {
54
+ "name": "AGNews",
55
+ "path": "ag_news",
56
+ "split": "train[:]",
57
+ "weight": 0.1,
58
+ "dialect": [
59
+ 0.20000000298023224,
60
+ 0.5,
61
+ 0.10000000149011612,
62
+ 0.10000000149011612,
63
+ 0.10000000149011612
64
+ ],
65
+ "class_id": 2,
66
+ "p": 0.03125000000000001
67
+ },
68
+ {
69
+ "name": "GSM8K",
70
+ "path": "openai/gsm8k",
71
+ "config": "main",
72
+ "split": "train[40%:60%]",
73
+ "weight": 0.6,
74
+ "dialect": [
75
+ 0.10000000149011612,
76
+ 0.15000000596046448,
77
+ 0.5,
78
+ 0.15000000596046448,
79
+ 0.10000000149011612
80
+ ],
81
+ "class_id": 3,
82
+ "p": 0.1875
83
+ },
84
+ {
85
+ "name": "AI2-ARC-Easy",
86
+ "path": "allenai/ai2_arc",
87
+ "config": "ARC-Easy",
88
+ "split": "train[30%:60%]",
89
+ "weight": 0.6,
90
+ "dialect": [
91
+ 0.05000000074505806,
92
+ 0.15000000596046448,
93
+ 0.4000000059604645,
94
+ 0.25,
95
+ 0.15000000596046448
96
+ ],
97
+ "class_id": 4,
98
+ "p": 0.1875
99
+ },
100
+ {
101
+ "name": "HH-RLHF",
102
+ "path": "Anthropic/hh-rlhf",
103
+ "split": "train[5%:10%]",
104
+ "weight": 0.5,
105
+ "dialect": [
106
+ 0.10000000149011612,
107
+ 0.25,
108
+ 0.20000000298023224,
109
+ 0.25,
110
+ 0.20000000298023224
111
+ ],
112
+ "class_id": 5,
113
+ "p": 0.15625
114
+ },
115
+ {
116
+ "name": "SVAMP",
117
+ "path": "ChilleD/SVAMP",
118
+ "split": "train",
119
+ "weight": 0.25,
120
+ "dialect": [
121
+ 0.10000000149011612,
122
+ 0.15000000596046448,
123
+ 0.550000011920929,
124
+ 0.15000000596046448,
125
+ 0.05000000074505806
126
+ ],
127
+ "class_id": 6,
128
+ "p": 0.078125
129
+ },
130
+ {
131
+ "name": "MATH-500",
132
+ "path": "HuggingFaceH4/MATH-500",
133
+ "split": "test",
134
+ "weight": 0.25,
135
+ "dialect": [
136
+ 0.05000000074505806,
137
+ 0.15000000596046448,
138
+ 0.6000000238418579,
139
+ 0.15000000596046448,
140
+ 0.05000000074505806
141
+ ],
142
+ "class_id": 7,
143
+ "p": 0.078125
144
+ },
145
+ {
146
+ "name": "SEP",
147
+ "path": "AiresPucrs/stanford-encyclopedia-philosophy",
148
+ "split": "train",
149
+ "weight": 0.3,
150
+ "dialect": [
151
+ 0.05000000074505806,
152
+ 0.44999998807907104,
153
+ 0.18000000715255737,
154
+ 0.2199999988079071,
155
+ 0.10000000149011612
156
+ ],
157
+ "class_id": 8,
158
+ "p": 0.09375
159
+ }
160
+ ]
161
+ }
training_config.json ADDED
@@ -0,0 +1,382 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "batch_size": 128,
3
+ "grad_accum_steps": 1,
4
+ "epochs": 10,
5
+ "lr": 0.0003,
6
+ "betas": [
7
+ 0.9,
8
+ 0.95
9
+ ],
10
+ "weight_decay": 0.1,
11
+ "warmup_steps": 500,
12
+ "max_steps": null,
13
+ "clip_grad": 1.0,
14
+ "min_lr": 1e-06,
15
+ "label_smoothing": 0.0,
16
+ "mixed_precision": "bf16",
17
+ "log_dir": "./runs/crystal_beeper",
18
+ "log_interval": 50,
19
+ "ckpt_dir": "./checkpoints_crystal",
20
+ "export_dir": "./export_crystal",
21
+ "resume": true,
22
+ "resume_strict": false,
23
+ "resume_tag": "best_model.safetensors",
24
+ "hf_repo": "AbstractPhil/beeper-ascii-v1",
25
+ "upload_to_hub": true,
26
+ "add_bos_eos": true,
27
+ "span_corrupt_frac": 0.0,
28
+ "val_ratio": 0.01,
29
+ "test_ratio": 0.01,
30
+ "max_rows_per_dataset": null,
31
+ "dataset_cache_verbose": true,
32
+ "lambda_route": 0.2,
33
+ "route_topk": 32,
34
+ "lambda_geom": 0.3,
35
+ "lambda_geom_angle": 0.8,
36
+ "lambda_geom_var": 0.3,
37
+ "lambda_geom_edge": 0.3,
38
+ "lambda_geom_vol": 0.6,
39
+ "lambda_geom_minrel": 1.0,
40
+ "geom_min_edge_rel": 0.6,
41
+ "geom_vol_lower_frac": 0.85,
42
+ "geom_sample_classes": 64,
43
+ "lambda_rose": 0.1,
44
+ "rose_scale": 1.8,
45
+ "contrast_warmup": 800,
46
+ "pent_temp": 0.1,
47
+ "lambda_contrast": 0.25,
48
+ "punctuation": {
49
+ "enable": true,
50
+ "chars": [
51
+ ".",
52
+ ",",
53
+ ";",
54
+ ":",
55
+ "!",
56
+ "?",
57
+ "'",
58
+ "\"",
59
+ "(",
60
+ ")",
61
+ "[",
62
+ "]",
63
+ "{",
64
+ "}",
65
+ "-",
66
+ "\u2014",
67
+ "\u2026"
68
+ ],
69
+ "alpha_soft": 0.6,
70
+ "hard_mask_gate": false,
71
+ "apply_to_coarse_ids": "ALL"
72
+ },
73
+ "harmony": {
74
+ "apply": true,
75
+ "system": "You are Crystal-Beeper, a helpful, honest, precise assistant.",
76
+ "style": "concise"
77
+ },
78
+ "stages": [
79
+ {
80
+ "name": "bootstrap",
81
+ "epochs": 1,
82
+ "lambda_route": 0.05,
83
+ "lambda_geom": 0.2,
84
+ "gate_tau": 0.1,
85
+ "punct_alpha": 0.6,
86
+ "hard_mask_gate": false,
87
+ "mix_sdpa": 1.0
88
+ },
89
+ {
90
+ "name": "crystal_warmup",
91
+ "epochs": 2,
92
+ "lambda_route": 0.2,
93
+ "lambda_geom": 0.3,
94
+ "gate_tau": 0.08,
95
+ "punct_alpha": 0.7,
96
+ "hard_mask_gate": false,
97
+ "mix_sdpa": 1.0
98
+ },
99
+ {
100
+ "name": "dictionary_crystals",
101
+ "epochs": 2,
102
+ "lambda_route": 0.25,
103
+ "lambda_geom": 0.35,
104
+ "gate_tau": 0.06,
105
+ "punct_alpha": 0.85,
106
+ "hard_mask_gate": false,
107
+ "mix_sdpa": 0.9
108
+ },
109
+ {
110
+ "name": "stability_tuning",
111
+ "epochs": 3,
112
+ "lambda_route": 0.3,
113
+ "lambda_geom": 0.4,
114
+ "gate_tau": 0.05,
115
+ "punct_alpha": 1.0,
116
+ "hard_mask_gate": false,
117
+ "mix_sdpa": 0.8
118
+ }
119
+ ],
120
+ "corpus": [
121
+ {
122
+ "name": "TinyStories",
123
+ "path": "roneneldan/TinyStories",
124
+ "split": "train[30%:50%]",
125
+ "weight": 0.1,
126
+ "dialect": [
127
+ 0.6,
128
+ 0.1,
129
+ 0.05,
130
+ 0.05,
131
+ 0.2
132
+ ]
133
+ },
134
+ {
135
+ "name": "WikipediaEN",
136
+ "path": "wikimedia/wikipedia",
137
+ "config": "20231101.en",
138
+ "split": "train[5%:15%]",
139
+ "weight": 0.5,
140
+ "dialect": [
141
+ 0.12,
142
+ 0.58,
143
+ 0.1,
144
+ 0.1,
145
+ 0.1
146
+ ]
147
+ },
148
+ {
149
+ "name": "AGNews",
150
+ "path": "ag_news",
151
+ "split": "train[:]",
152
+ "weight": 0.1,
153
+ "dialect": [
154
+ 0.2,
155
+ 0.5,
156
+ 0.1,
157
+ 0.1,
158
+ 0.1
159
+ ]
160
+ },
161
+ {
162
+ "name": "GSM8K",
163
+ "path": "openai/gsm8k",
164
+ "config": "main",
165
+ "split": "train[40%:60%]",
166
+ "weight": 0.6,
167
+ "dialect": [
168
+ 0.1,
169
+ 0.15,
170
+ 0.5,
171
+ 0.15,
172
+ 0.1
173
+ ]
174
+ },
175
+ {
176
+ "name": "AI2-ARC-Easy",
177
+ "path": "allenai/ai2_arc",
178
+ "config": "ARC-Easy",
179
+ "split": "train[30%:60%]",
180
+ "weight": 0.6,
181
+ "dialect": [
182
+ 0.05,
183
+ 0.15,
184
+ 0.4,
185
+ 0.25,
186
+ 0.15
187
+ ]
188
+ },
189
+ {
190
+ "name": "HH-RLHF",
191
+ "path": "Anthropic/hh-rlhf",
192
+ "split": "train[5%:10%]",
193
+ "weight": 0.5,
194
+ "dialect": [
195
+ 0.1,
196
+ 0.25,
197
+ 0.2,
198
+ 0.25,
199
+ 0.2
200
+ ]
201
+ },
202
+ {
203
+ "name": "SVAMP",
204
+ "path": "ChilleD/SVAMP",
205
+ "split": "train",
206
+ "weight": 0.25,
207
+ "dialect": [
208
+ 0.1,
209
+ 0.15,
210
+ 0.55,
211
+ 0.15,
212
+ 0.05
213
+ ]
214
+ },
215
+ {
216
+ "name": "MATH-500",
217
+ "path": "HuggingFaceH4/MATH-500",
218
+ "split": "test",
219
+ "weight": 0.25,
220
+ "dialect": [
221
+ 0.05,
222
+ 0.15,
223
+ 0.6,
224
+ 0.15,
225
+ 0.05
226
+ ]
227
+ },
228
+ {
229
+ "name": "SEP",
230
+ "path": "AiresPucrs/stanford-encyclopedia-philosophy",
231
+ "split": "train",
232
+ "weight": 0.3,
233
+ "dialect": [
234
+ 0.05,
235
+ 0.45,
236
+ 0.18,
237
+ 0.22,
238
+ 0.1
239
+ ]
240
+ }
241
+ ],
242
+ "_alive_entries": [
243
+ {
244
+ "name": "TinyStories",
245
+ "path": "roneneldan/TinyStories",
246
+ "split": "train[30%:50%]",
247
+ "weight": 0.1,
248
+ "dialect": [
249
+ 0.6000000238418579,
250
+ 0.10000000149011612,
251
+ 0.05000000074505806,
252
+ 0.05000000074505806,
253
+ 0.20000000298023224
254
+ ],
255
+ "class_id": 0,
256
+ "p": 0.03125000000000001
257
+ },
258
+ {
259
+ "name": "WikipediaEN",
260
+ "path": "wikimedia/wikipedia",
261
+ "config": "20231101.en",
262
+ "split": "train[5%:15%]",
263
+ "weight": 0.5,
264
+ "dialect": [
265
+ 0.11999999731779099,
266
+ 0.5799999833106995,
267
+ 0.10000000149011612,
268
+ 0.10000000149011612,
269
+ 0.10000000149011612
270
+ ],
271
+ "class_id": 1,
272
+ "p": 0.15625
273
+ },
274
+ {
275
+ "name": "AGNews",
276
+ "path": "ag_news",
277
+ "split": "train[:]",
278
+ "weight": 0.1,
279
+ "dialect": [
280
+ 0.20000000298023224,
281
+ 0.5,
282
+ 0.10000000149011612,
283
+ 0.10000000149011612,
284
+ 0.10000000149011612
285
+ ],
286
+ "class_id": 2,
287
+ "p": 0.03125000000000001
288
+ },
289
+ {
290
+ "name": "GSM8K",
291
+ "path": "openai/gsm8k",
292
+ "config": "main",
293
+ "split": "train[40%:60%]",
294
+ "weight": 0.6,
295
+ "dialect": [
296
+ 0.10000000149011612,
297
+ 0.15000000596046448,
298
+ 0.5,
299
+ 0.15000000596046448,
300
+ 0.10000000149011612
301
+ ],
302
+ "class_id": 3,
303
+ "p": 0.1875
304
+ },
305
+ {
306
+ "name": "AI2-ARC-Easy",
307
+ "path": "allenai/ai2_arc",
308
+ "config": "ARC-Easy",
309
+ "split": "train[30%:60%]",
310
+ "weight": 0.6,
311
+ "dialect": [
312
+ 0.05000000074505806,
313
+ 0.15000000596046448,
314
+ 0.4000000059604645,
315
+ 0.25,
316
+ 0.15000000596046448
317
+ ],
318
+ "class_id": 4,
319
+ "p": 0.1875
320
+ },
321
+ {
322
+ "name": "HH-RLHF",
323
+ "path": "Anthropic/hh-rlhf",
324
+ "split": "train[5%:10%]",
325
+ "weight": 0.5,
326
+ "dialect": [
327
+ 0.10000000149011612,
328
+ 0.25,
329
+ 0.20000000298023224,
330
+ 0.25,
331
+ 0.20000000298023224
332
+ ],
333
+ "class_id": 5,
334
+ "p": 0.15625
335
+ },
336
+ {
337
+ "name": "SVAMP",
338
+ "path": "ChilleD/SVAMP",
339
+ "split": "train",
340
+ "weight": 0.25,
341
+ "dialect": [
342
+ 0.10000000149011612,
343
+ 0.15000000596046448,
344
+ 0.550000011920929,
345
+ 0.15000000596046448,
346
+ 0.05000000074505806
347
+ ],
348
+ "class_id": 6,
349
+ "p": 0.078125
350
+ },
351
+ {
352
+ "name": "MATH-500",
353
+ "path": "HuggingFaceH4/MATH-500",
354
+ "split": "test",
355
+ "weight": 0.25,
356
+ "dialect": [
357
+ 0.05000000074505806,
358
+ 0.15000000596046448,
359
+ 0.6000000238418579,
360
+ 0.15000000596046448,
361
+ 0.05000000074505806
362
+ ],
363
+ "class_id": 7,
364
+ "p": 0.078125
365
+ },
366
+ {
367
+ "name": "SEP",
368
+ "path": "AiresPucrs/stanford-encyclopedia-philosophy",
369
+ "split": "train",
370
+ "weight": 0.3,
371
+ "dialect": [
372
+ 0.05000000074505806,
373
+ 0.44999998807907104,
374
+ 0.18000000715255737,
375
+ 0.2199999988079071,
376
+ 0.10000000149011612
377
+ ],
378
+ "class_id": 8,
379
+ "p": 0.09375
380
+ }
381
+ ]
382
+ }