imagick committed
Commit 54c5f1b · verified · 1 parent: 0fc7c75

Upload folder using huggingface_hub

chat_template.jinja ADDED
@@ -0,0 +1,31 @@
+ {%- for message in messages -%}
+ {%- if loop.first and messages[0]['role'] != 'system' -%}
+ {{'<|im_system|>system<|im_middle|>You are a helpful assistant<|im_end|>'}}
+ {%- endif -%}
+ {%- if message['role'] == 'system' -%}
+ {{'<|im_system|>'}}
+ {%- endif -%}
+ {%- if message['role'] == 'user' -%}
+ {{'<|im_user|>'}}
+ {%- endif -%}
+ {%- if message['role'] == 'assistant' -%}
+ {{'<|im_assistant|>'}}
+ {%- endif -%}
+ {{- message['role'] -}}
+ {{'<|im_middle|>'}}
+ {%- if message['content'] is string -%}
+ {{- message['content'] + '<|im_end|>' -}}
+ {%- else -%}
+ {%- for content in message['content'] -%}
+ {%- if content['type'] == 'image' or 'image' in content or 'image_url' in content -%}
+ {{'<|media_start|>image<|media_content|><|media_pad|><|media_end|>'}}
+ {%- else -%}
+ {{content['text']}}
+ {%- endif -%}
+ {%- endfor -%}
+ {{'<|im_end|>'}}
+ {%- endif -%}
+ {%- endfor -%}
+ {%- if add_generation_prompt -%}
+ {{'<|im_assistant|>assistant<|im_middle|>'}}
+ {%- endif -%}
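
The template injects a default system message when none is supplied, wraps each turn in role markers, and replaces image content items with media tokens. For reference, a minimal sketch of rendering it with plain Jinja2 follows; the sample messages and file path are illustrative assumptions, and in practice one would call tokenizer.apply_chat_template instead.

    # Minimal sketch: render chat_template.jinja with plain Jinja2.
    # Assumes `pip install jinja2`; the sample messages are illustrative.
    from jinja2 import Template

    with open("chat_template.jinja") as f:
        template = Template(f.read())

    messages = [
        {"role": "user", "content": [
            {"type": "image", "image": "demo.png"},           # rendered as <|media_...|> tokens
            {"type": "text", "text": "Describe this image."},
        ]},
    ]

    prompt = template.render(messages=messages, add_generation_prompt=True)
    print(prompt)
    # Expected output (one continuous string, shown wrapped here):
    # <|im_system|>system<|im_middle|>You are a helpful assistant<|im_end|>
    # <|im_user|>user<|im_middle|><|media_start|>image<|media_content|><|media_pad|><|media_end|>Describe this image.<|im_end|>
    # <|im_assistant|>assistant<|im_middle|>
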
config.json ADDED
@@ -0,0 +1,1458 @@
+ {
+ "add_cross_attention": false,
+ "architectures": [
+ "KimiVLForConditionalGeneration"
+ ],
+ "auto_map": {
+ "AutoConfig": "configuration_kimi_vl.KimiVLConfig",
+ "AutoModel": "modeling_kimi_vl.KimiVLForConditionalGeneration",
+ "AutoModelForCausalLM": "modeling_kimi_vl.KimiVLForConditionalGeneration"
+ },
+ "bad_words_ids": null,
+ "begin_suppress_tokens": null,
+ "bos_token_id": null,
+ "chunk_size_feed_forward": 0,
+ "cross_attention_hidden_size": null,
+ "decoder_start_token_id": null,
+ "diversity_penalty": 0.0,
+ "do_sample": false,
+ "early_stopping": false,
+ "encoder_no_repeat_ngram_size": 0,
+ "eos_token_id": null,
+ "exponential_decay_length_penalty": null,
+ "finetuning_task": null,
+ "forced_bos_token_id": null,
+ "forced_eos_token_id": null,
+ "id2label": {
+ "0": "LABEL_0",
+ "1": "LABEL_1"
+ },
+ "ignore_index": -100,
+ "is_decoder": false,
+ "is_encoder_decoder": false,
+ "label2id": {
+ "LABEL_0": 0,
+ "LABEL_1": 1
+ },
+ "length_penalty": 1.0,
+ "max_length": 20,
+ "media_placeholder_token_id": 163605,
+ "min_length": 0,
+ "model_type": "kimi_vl",
+ "no_repeat_ngram_size": 0,
+ "num_beam_groups": 1,
+ "num_beams": 1,
+ "num_return_sequences": 1,
+ "output_attentions": false,
+ "output_hidden_states": false,
+ "output_scores": false,
+ "pad_token_id": 0,
+ "prefix": null,
+ "problem_type": null,
+ "pruned_heads": {},
+ "quantization": {
+ "group_size": 64,
+ "bits": 8,
+ "vision_tower.patch_embed.proj": false,
+ "vision_tower.patch_embed.pos_emb": false,
+ "vision_tower.rotary_pos_emb": false,
+ "vision_tower.blocks.0.norm0": false,
+ "vision_tower.blocks.0.norm1": false,
+ "vision_tower.blocks.0.attn.wqkv": false,
+ "vision_tower.blocks.0.attn.wo": false,
+ "vision_tower.blocks.0.mlp.activation_fn": false,
+ "vision_tower.blocks.0.mlp.fc0": false,
+ "vision_tower.blocks.0.mlp.fc1": false,
+ "vision_tower.blocks.1.norm0": false,
+ "vision_tower.blocks.1.norm1": false,
+ "vision_tower.blocks.1.attn.wqkv": false,
+ "vision_tower.blocks.1.attn.wo": false,
+ "vision_tower.blocks.1.mlp.activation_fn": false,
+ "vision_tower.blocks.1.mlp.fc0": false,
+ "vision_tower.blocks.1.mlp.fc1": false,
+ "vision_tower.blocks.2.norm0": false,
+ "vision_tower.blocks.2.norm1": false,
+ "vision_tower.blocks.2.attn.wqkv": false,
+ "vision_tower.blocks.2.attn.wo": false,
+ "vision_tower.blocks.2.mlp.activation_fn": false,
+ "vision_tower.blocks.2.mlp.fc0": false,
+ "vision_tower.blocks.2.mlp.fc1": false,
+ "vision_tower.blocks.3.norm0": false,
+ "vision_tower.blocks.3.norm1": false,
+ "vision_tower.blocks.3.attn.wqkv": false,
+ "vision_tower.blocks.3.attn.wo": false,
+ "vision_tower.blocks.3.mlp.activation_fn": false,
+ "vision_tower.blocks.3.mlp.fc0": false,
+ "vision_tower.blocks.3.mlp.fc1": false,
+ "vision_tower.blocks.4.norm0": false,
+ "vision_tower.blocks.4.norm1": false,
+ "vision_tower.blocks.4.attn.wqkv": false,
+ "vision_tower.blocks.4.attn.wo": false,
+ "vision_tower.blocks.4.mlp.activation_fn": false,
+ "vision_tower.blocks.4.mlp.fc0": false,
+ "vision_tower.blocks.4.mlp.fc1": false,
+ "vision_tower.blocks.5.norm0": false,
+ "vision_tower.blocks.5.norm1": false,
+ "vision_tower.blocks.5.attn.wqkv": false,
+ "vision_tower.blocks.5.attn.wo": false,
+ "vision_tower.blocks.5.mlp.activation_fn": false,
+ "vision_tower.blocks.5.mlp.fc0": false,
+ "vision_tower.blocks.5.mlp.fc1": false,
+ "vision_tower.blocks.6.norm0": false,
+ "vision_tower.blocks.6.norm1": false,
+ "vision_tower.blocks.6.attn.wqkv": false,
+ "vision_tower.blocks.6.attn.wo": false,
+ "vision_tower.blocks.6.mlp.activation_fn": false,
+ "vision_tower.blocks.6.mlp.fc0": false,
+ "vision_tower.blocks.6.mlp.fc1": false,
+ "vision_tower.blocks.7.norm0": false,
+ "vision_tower.blocks.7.norm1": false,
+ "vision_tower.blocks.7.attn.wqkv": false,
+ "vision_tower.blocks.7.attn.wo": false,
+ "vision_tower.blocks.7.mlp.activation_fn": false,
+ "vision_tower.blocks.7.mlp.fc0": false,
+ "vision_tower.blocks.7.mlp.fc1": false,
+ "vision_tower.blocks.8.norm0": false,
+ "vision_tower.blocks.8.norm1": false,
+ "vision_tower.blocks.8.attn.wqkv": false,
+ "vision_tower.blocks.8.attn.wo": false,
+ "vision_tower.blocks.8.mlp.activation_fn": false,
+ "vision_tower.blocks.8.mlp.fc0": false,
+ "vision_tower.blocks.8.mlp.fc1": false,
+ "vision_tower.blocks.9.norm0": false,
+ "vision_tower.blocks.9.norm1": false,
+ "vision_tower.blocks.9.attn.wqkv": false,
+ "vision_tower.blocks.9.attn.wo": false,
+ "vision_tower.blocks.9.mlp.activation_fn": false,
+ "vision_tower.blocks.9.mlp.fc0": false,
+ "vision_tower.blocks.9.mlp.fc1": false,
+ "vision_tower.blocks.10.norm0": false,
+ "vision_tower.blocks.10.norm1": false,
+ "vision_tower.blocks.10.attn.wqkv": false,
+ "vision_tower.blocks.10.attn.wo": false,
+ "vision_tower.blocks.10.mlp.activation_fn": false,
+ "vision_tower.blocks.10.mlp.fc0": false,
+ "vision_tower.blocks.10.mlp.fc1": false,
+ "vision_tower.blocks.11.norm0": false,
+ "vision_tower.blocks.11.norm1": false,
+ "vision_tower.blocks.11.attn.wqkv": false,
+ "vision_tower.blocks.11.attn.wo": false,
+ "vision_tower.blocks.11.mlp.activation_fn": false,
+ "vision_tower.blocks.11.mlp.fc0": false,
+ "vision_tower.blocks.11.mlp.fc1": false,
+ "vision_tower.blocks.12.norm0": false,
+ "vision_tower.blocks.12.norm1": false,
+ "vision_tower.blocks.12.attn.wqkv": false,
+ "vision_tower.blocks.12.attn.wo": false,
+ "vision_tower.blocks.12.mlp.activation_fn": false,
+ "vision_tower.blocks.12.mlp.fc0": false,
+ "vision_tower.blocks.12.mlp.fc1": false,
+ "vision_tower.blocks.13.norm0": false,
+ "vision_tower.blocks.13.norm1": false,
+ "vision_tower.blocks.13.attn.wqkv": false,
+ "vision_tower.blocks.13.attn.wo": false,
+ "vision_tower.blocks.13.mlp.activation_fn": false,
+ "vision_tower.blocks.13.mlp.fc0": false,
+ "vision_tower.blocks.13.mlp.fc1": false,
+ "vision_tower.blocks.14.norm0": false,
+ "vision_tower.blocks.14.norm1": false,
+ "vision_tower.blocks.14.attn.wqkv": false,
+ "vision_tower.blocks.14.attn.wo": false,
+ "vision_tower.blocks.14.mlp.activation_fn": false,
+ "vision_tower.blocks.14.mlp.fc0": false,
+ "vision_tower.blocks.14.mlp.fc1": false,
+ "vision_tower.blocks.15.norm0": false,
+ "vision_tower.blocks.15.norm1": false,
+ "vision_tower.blocks.15.attn.wqkv": false,
+ "vision_tower.blocks.15.attn.wo": false,
+ "vision_tower.blocks.15.mlp.activation_fn": false,
+ "vision_tower.blocks.15.mlp.fc0": false,
+ "vision_tower.blocks.15.mlp.fc1": false,
+ "vision_tower.blocks.16.norm0": false,
+ "vision_tower.blocks.16.norm1": false,
+ "vision_tower.blocks.16.attn.wqkv": false,
+ "vision_tower.blocks.16.attn.wo": false,
+ "vision_tower.blocks.16.mlp.activation_fn": false,
+ "vision_tower.blocks.16.mlp.fc0": false,
+ "vision_tower.blocks.16.mlp.fc1": false,
+ "vision_tower.blocks.17.norm0": false,
+ "vision_tower.blocks.17.norm1": false,
+ "vision_tower.blocks.17.attn.wqkv": false,
+ "vision_tower.blocks.17.attn.wo": false,
+ "vision_tower.blocks.17.mlp.activation_fn": false,
+ "vision_tower.blocks.17.mlp.fc0": false,
+ "vision_tower.blocks.17.mlp.fc1": false,
+ "vision_tower.blocks.18.norm0": false,
+ "vision_tower.blocks.18.norm1": false,
+ "vision_tower.blocks.18.attn.wqkv": false,
+ "vision_tower.blocks.18.attn.wo": false,
+ "vision_tower.blocks.18.mlp.activation_fn": false,
+ "vision_tower.blocks.18.mlp.fc0": false,
+ "vision_tower.blocks.18.mlp.fc1": false,
+ "vision_tower.blocks.19.norm0": false,
+ "vision_tower.blocks.19.norm1": false,
+ "vision_tower.blocks.19.attn.wqkv": false,
+ "vision_tower.blocks.19.attn.wo": false,
+ "vision_tower.blocks.19.mlp.activation_fn": false,
+ "vision_tower.blocks.19.mlp.fc0": false,
+ "vision_tower.blocks.19.mlp.fc1": false,
+ "vision_tower.blocks.20.norm0": false,
+ "vision_tower.blocks.20.norm1": false,
+ "vision_tower.blocks.20.attn.wqkv": false,
+ "vision_tower.blocks.20.attn.wo": false,
+ "vision_tower.blocks.20.mlp.activation_fn": false,
+ "vision_tower.blocks.20.mlp.fc0": false,
+ "vision_tower.blocks.20.mlp.fc1": false,
+ "vision_tower.blocks.21.norm0": false,
+ "vision_tower.blocks.21.norm1": false,
+ "vision_tower.blocks.21.attn.wqkv": false,
+ "vision_tower.blocks.21.attn.wo": false,
+ "vision_tower.blocks.21.mlp.activation_fn": false,
+ "vision_tower.blocks.21.mlp.fc0": false,
+ "vision_tower.blocks.21.mlp.fc1": false,
+ "vision_tower.blocks.22.norm0": false,
+ "vision_tower.blocks.22.norm1": false,
+ "vision_tower.blocks.22.attn.wqkv": false,
+ "vision_tower.blocks.22.attn.wo": false,
+ "vision_tower.blocks.22.mlp.activation_fn": false,
+ "vision_tower.blocks.22.mlp.fc0": false,
+ "vision_tower.blocks.22.mlp.fc1": false,
+ "vision_tower.blocks.23.norm0": false,
+ "vision_tower.blocks.23.norm1": false,
+ "vision_tower.blocks.23.attn.wqkv": false,
+ "vision_tower.blocks.23.attn.wo": false,
+ "vision_tower.blocks.23.mlp.activation_fn": false,
+ "vision_tower.blocks.23.mlp.fc0": false,
+ "vision_tower.blocks.23.mlp.fc1": false,
+ "vision_tower.blocks.24.norm0": false,
+ "vision_tower.blocks.24.norm1": false,
+ "vision_tower.blocks.24.attn.wqkv": false,
+ "vision_tower.blocks.24.attn.wo": false,
+ "vision_tower.blocks.24.mlp.activation_fn": false,
+ "vision_tower.blocks.24.mlp.fc0": false,
+ "vision_tower.blocks.24.mlp.fc1": false,
+ "vision_tower.blocks.25.norm0": false,
+ "vision_tower.blocks.25.norm1": false,
+ "vision_tower.blocks.25.attn.wqkv": false,
+ "vision_tower.blocks.25.attn.wo": false,
+ "vision_tower.blocks.25.mlp.activation_fn": false,
+ "vision_tower.blocks.25.mlp.fc0": false,
+ "vision_tower.blocks.25.mlp.fc1": false,
+ "vision_tower.blocks.26.norm0": false,
+ "vision_tower.blocks.26.norm1": false,
+ "vision_tower.blocks.26.attn.wqkv": false,
+ "vision_tower.blocks.26.attn.wo": false,
+ "vision_tower.blocks.26.mlp.activation_fn": false,
+ "vision_tower.blocks.26.mlp.fc0": false,
+ "vision_tower.blocks.26.mlp.fc1": false,
+ "vision_tower.final_layernorm": false,
+ "vision_tower.rope_pos_emb": false,
+ "language_model.model.embed_tokens": true,
+ "language_model.model.layers.0.self_attn.q_proj": true,
+ "language_model.model.layers.0.self_attn.kv_a_proj_with_mqa": true,
+ "language_model.model.layers.0.self_attn.kv_a_layernorm": false,
+ "language_model.model.layers.0.self_attn.kv_b_proj": true,
+ "language_model.model.layers.0.self_attn.o_proj": true,
+ "language_model.model.layers.0.self_attn.rope": false,
+ "language_model.model.layers.0.mlp.gate_proj": true,
+ "language_model.model.layers.0.mlp.up_proj": true,
+ "language_model.model.layers.0.mlp.down_proj": true,
+ "language_model.model.layers.0.input_layernorm": false,
+ "language_model.model.layers.0.post_attention_layernorm": false,
+ "language_model.model.layers.1.self_attn.q_proj": true,
+ "language_model.model.layers.1.self_attn.kv_a_proj_with_mqa": true,
+ "language_model.model.layers.1.self_attn.kv_a_layernorm": false,
+ "language_model.model.layers.1.self_attn.kv_b_proj": true,
+ "language_model.model.layers.1.self_attn.o_proj": true,
+ "language_model.model.layers.1.self_attn.rope": false,
+ "language_model.model.layers.1.mlp.switch_mlp.gate_proj": true,
+ "language_model.model.layers.1.mlp.switch_mlp.up_proj": true,
+ "language_model.model.layers.1.mlp.switch_mlp.down_proj": true,
+ "language_model.model.layers.1.mlp.gate": false,
+ "language_model.model.layers.1.mlp.shared_experts.gate_proj": true,
+ "language_model.model.layers.1.mlp.shared_experts.up_proj": true,
+ "language_model.model.layers.1.mlp.shared_experts.down_proj": true,
+ "language_model.model.layers.1.input_layernorm": false,
+ "language_model.model.layers.1.post_attention_layernorm": false,
+ "language_model.model.layers.2.self_attn.q_proj": true,
+ "language_model.model.layers.2.self_attn.kv_a_proj_with_mqa": true,
+ "language_model.model.layers.2.self_attn.kv_a_layernorm": false,
+ "language_model.model.layers.2.self_attn.kv_b_proj": true,
+ "language_model.model.layers.2.self_attn.o_proj": true,
+ "language_model.model.layers.2.self_attn.rope": false,
+ "language_model.model.layers.2.mlp.switch_mlp.gate_proj": true,
+ "language_model.model.layers.2.mlp.switch_mlp.up_proj": true,
+ "language_model.model.layers.2.mlp.switch_mlp.down_proj": true,
+ "language_model.model.layers.2.mlp.gate": false,
+ "language_model.model.layers.2.mlp.shared_experts.gate_proj": true,
+ "language_model.model.layers.2.mlp.shared_experts.up_proj": true,
+ "language_model.model.layers.2.mlp.shared_experts.down_proj": true,
+ "language_model.model.layers.2.input_layernorm": false,
+ "language_model.model.layers.2.post_attention_layernorm": false,
+ "language_model.model.layers.3.self_attn.q_proj": true,
+ "language_model.model.layers.3.self_attn.kv_a_proj_with_mqa": true,
+ "language_model.model.layers.3.self_attn.kv_a_layernorm": false,
+ "language_model.model.layers.3.self_attn.kv_b_proj": true,
+ "language_model.model.layers.3.self_attn.o_proj": true,
+ "language_model.model.layers.3.self_attn.rope": false,
+ "language_model.model.layers.3.mlp.switch_mlp.gate_proj": true,
+ "language_model.model.layers.3.mlp.switch_mlp.up_proj": true,
+ "language_model.model.layers.3.mlp.switch_mlp.down_proj": true,
+ "language_model.model.layers.3.mlp.gate": false,
+ "language_model.model.layers.3.mlp.shared_experts.gate_proj": true,
+ "language_model.model.layers.3.mlp.shared_experts.up_proj": true,
+ "language_model.model.layers.3.mlp.shared_experts.down_proj": true,
+ "language_model.model.layers.3.input_layernorm": false,
+ "language_model.model.layers.3.post_attention_layernorm": false,
+ "language_model.model.layers.4.self_attn.q_proj": true,
+ "language_model.model.layers.4.self_attn.kv_a_proj_with_mqa": true,
+ "language_model.model.layers.4.self_attn.kv_a_layernorm": false,
+ "language_model.model.layers.4.self_attn.kv_b_proj": true,
+ "language_model.model.layers.4.self_attn.o_proj": true,
+ "language_model.model.layers.4.self_attn.rope": false,
+ "language_model.model.layers.4.mlp.switch_mlp.gate_proj": true,
+ "language_model.model.layers.4.mlp.switch_mlp.up_proj": true,
+ "language_model.model.layers.4.mlp.switch_mlp.down_proj": true,
+ "language_model.model.layers.4.mlp.gate": false,
+ "language_model.model.layers.4.mlp.shared_experts.gate_proj": true,
+ "language_model.model.layers.4.mlp.shared_experts.up_proj": true,
+ "language_model.model.layers.4.mlp.shared_experts.down_proj": true,
+ "language_model.model.layers.4.input_layernorm": false,
+ "language_model.model.layers.4.post_attention_layernorm": false,
+ "language_model.model.layers.5.self_attn.q_proj": true,
+ "language_model.model.layers.5.self_attn.kv_a_proj_with_mqa": true,
+ "language_model.model.layers.5.self_attn.kv_a_layernorm": false,
+ "language_model.model.layers.5.self_attn.kv_b_proj": true,
+ "language_model.model.layers.5.self_attn.o_proj": true,
+ "language_model.model.layers.5.self_attn.rope": false,
+ "language_model.model.layers.5.mlp.switch_mlp.gate_proj": true,
+ "language_model.model.layers.5.mlp.switch_mlp.up_proj": true,
+ "language_model.model.layers.5.mlp.switch_mlp.down_proj": true,
+ "language_model.model.layers.5.mlp.gate": false,
+ "language_model.model.layers.5.mlp.shared_experts.gate_proj": true,
+ "language_model.model.layers.5.mlp.shared_experts.up_proj": true,
+ "language_model.model.layers.5.mlp.shared_experts.down_proj": true,
+ "language_model.model.layers.5.input_layernorm": false,
+ "language_model.model.layers.5.post_attention_layernorm": false,
+ "language_model.model.layers.6.self_attn.q_proj": true,
+ "language_model.model.layers.6.self_attn.kv_a_proj_with_mqa": true,
+ "language_model.model.layers.6.self_attn.kv_a_layernorm": false,
+ "language_model.model.layers.6.self_attn.kv_b_proj": true,
+ "language_model.model.layers.6.self_attn.o_proj": true,
+ "language_model.model.layers.6.self_attn.rope": false,
+ "language_model.model.layers.6.mlp.switch_mlp.gate_proj": true,
+ "language_model.model.layers.6.mlp.switch_mlp.up_proj": true,
+ "language_model.model.layers.6.mlp.switch_mlp.down_proj": true,
+ "language_model.model.layers.6.mlp.gate": false,
+ "language_model.model.layers.6.mlp.shared_experts.gate_proj": true,
+ "language_model.model.layers.6.mlp.shared_experts.up_proj": true,
+ "language_model.model.layers.6.mlp.shared_experts.down_proj": true,
+ "language_model.model.layers.6.input_layernorm": false,
+ "language_model.model.layers.6.post_attention_layernorm": false,
+ "language_model.model.layers.7.self_attn.q_proj": true,
+ "language_model.model.layers.7.self_attn.kv_a_proj_with_mqa": true,
+ "language_model.model.layers.7.self_attn.kv_a_layernorm": false,
+ "language_model.model.layers.7.self_attn.kv_b_proj": true,
+ "language_model.model.layers.7.self_attn.o_proj": true,
+ "language_model.model.layers.7.self_attn.rope": false,
+ "language_model.model.layers.7.mlp.switch_mlp.gate_proj": true,
+ "language_model.model.layers.7.mlp.switch_mlp.up_proj": true,
+ "language_model.model.layers.7.mlp.switch_mlp.down_proj": true,
+ "language_model.model.layers.7.mlp.gate": false,
+ "language_model.model.layers.7.mlp.shared_experts.gate_proj": true,
+ "language_model.model.layers.7.mlp.shared_experts.up_proj": true,
+ "language_model.model.layers.7.mlp.shared_experts.down_proj": true,
+ "language_model.model.layers.7.input_layernorm": false,
+ "language_model.model.layers.7.post_attention_layernorm": false,
+ "language_model.model.layers.8.self_attn.q_proj": true,
+ "language_model.model.layers.8.self_attn.kv_a_proj_with_mqa": true,
+ "language_model.model.layers.8.self_attn.kv_a_layernorm": false,
+ "language_model.model.layers.8.self_attn.kv_b_proj": true,
+ "language_model.model.layers.8.self_attn.o_proj": true,
+ "language_model.model.layers.8.self_attn.rope": false,
+ "language_model.model.layers.8.mlp.switch_mlp.gate_proj": true,
+ "language_model.model.layers.8.mlp.switch_mlp.up_proj": true,
+ "language_model.model.layers.8.mlp.switch_mlp.down_proj": true,
+ "language_model.model.layers.8.mlp.gate": false,
+ "language_model.model.layers.8.mlp.shared_experts.gate_proj": true,
+ "language_model.model.layers.8.mlp.shared_experts.up_proj": true,
+ "language_model.model.layers.8.mlp.shared_experts.down_proj": true,
+ "language_model.model.layers.8.input_layernorm": false,
+ "language_model.model.layers.8.post_attention_layernorm": false,
+ "language_model.model.layers.9.self_attn.q_proj": true,
+ "language_model.model.layers.9.self_attn.kv_a_proj_with_mqa": true,
+ "language_model.model.layers.9.self_attn.kv_a_layernorm": false,
+ "language_model.model.layers.9.self_attn.kv_b_proj": true,
+ "language_model.model.layers.9.self_attn.o_proj": true,
+ "language_model.model.layers.9.self_attn.rope": false,
+ "language_model.model.layers.9.mlp.switch_mlp.gate_proj": true,
+ "language_model.model.layers.9.mlp.switch_mlp.up_proj": true,
+ "language_model.model.layers.9.mlp.switch_mlp.down_proj": true,
+ "language_model.model.layers.9.mlp.gate": false,
+ "language_model.model.layers.9.mlp.shared_experts.gate_proj": true,
+ "language_model.model.layers.9.mlp.shared_experts.up_proj": true,
+ "language_model.model.layers.9.mlp.shared_experts.down_proj": true,
+ "language_model.model.layers.9.input_layernorm": false,
+ "language_model.model.layers.9.post_attention_layernorm": false,
+ "language_model.model.layers.10.self_attn.q_proj": true,
+ "language_model.model.layers.10.self_attn.kv_a_proj_with_mqa": true,
+ "language_model.model.layers.10.self_attn.kv_a_layernorm": false,
+ "language_model.model.layers.10.self_attn.kv_b_proj": true,
+ "language_model.model.layers.10.self_attn.o_proj": true,
+ "language_model.model.layers.10.self_attn.rope": false,
+ "language_model.model.layers.10.mlp.switch_mlp.gate_proj": true,
+ "language_model.model.layers.10.mlp.switch_mlp.up_proj": true,
+ "language_model.model.layers.10.mlp.switch_mlp.down_proj": true,
+ "language_model.model.layers.10.mlp.gate": false,
+ "language_model.model.layers.10.mlp.shared_experts.gate_proj": true,
+ "language_model.model.layers.10.mlp.shared_experts.up_proj": true,
+ "language_model.model.layers.10.mlp.shared_experts.down_proj": true,
+ "language_model.model.layers.10.input_layernorm": false,
+ "language_model.model.layers.10.post_attention_layernorm": false,
+ "language_model.model.layers.11.self_attn.q_proj": true,
+ "language_model.model.layers.11.self_attn.kv_a_proj_with_mqa": true,
+ "language_model.model.layers.11.self_attn.kv_a_layernorm": false,
+ "language_model.model.layers.11.self_attn.kv_b_proj": true,
+ "language_model.model.layers.11.self_attn.o_proj": true,
+ "language_model.model.layers.11.self_attn.rope": false,
+ "language_model.model.layers.11.mlp.switch_mlp.gate_proj": true,
+ "language_model.model.layers.11.mlp.switch_mlp.up_proj": true,
+ "language_model.model.layers.11.mlp.switch_mlp.down_proj": true,
+ "language_model.model.layers.11.mlp.gate": false,
+ "language_model.model.layers.11.mlp.shared_experts.gate_proj": true,
+ "language_model.model.layers.11.mlp.shared_experts.up_proj": true,
+ "language_model.model.layers.11.mlp.shared_experts.down_proj": true,
+ "language_model.model.layers.11.input_layernorm": false,
+ "language_model.model.layers.11.post_attention_layernorm": false,
+ "language_model.model.layers.12.self_attn.q_proj": true,
+ "language_model.model.layers.12.self_attn.kv_a_proj_with_mqa": true,
+ "language_model.model.layers.12.self_attn.kv_a_layernorm": false,
+ "language_model.model.layers.12.self_attn.kv_b_proj": true,
+ "language_model.model.layers.12.self_attn.o_proj": true,
+ "language_model.model.layers.12.self_attn.rope": false,
+ "language_model.model.layers.12.mlp.switch_mlp.gate_proj": true,
+ "language_model.model.layers.12.mlp.switch_mlp.up_proj": true,
+ "language_model.model.layers.12.mlp.switch_mlp.down_proj": true,
+ "language_model.model.layers.12.mlp.gate": false,
+ "language_model.model.layers.12.mlp.shared_experts.gate_proj": true,
+ "language_model.model.layers.12.mlp.shared_experts.up_proj": true,
+ "language_model.model.layers.12.mlp.shared_experts.down_proj": true,
+ "language_model.model.layers.12.input_layernorm": false,
+ "language_model.model.layers.12.post_attention_layernorm": false,
+ "language_model.model.layers.13.self_attn.q_proj": true,
+ "language_model.model.layers.13.self_attn.kv_a_proj_with_mqa": true,
+ "language_model.model.layers.13.self_attn.kv_a_layernorm": false,
+ "language_model.model.layers.13.self_attn.kv_b_proj": true,
+ "language_model.model.layers.13.self_attn.o_proj": true,
+ "language_model.model.layers.13.self_attn.rope": false,
+ "language_model.model.layers.13.mlp.switch_mlp.gate_proj": true,
+ "language_model.model.layers.13.mlp.switch_mlp.up_proj": true,
+ "language_model.model.layers.13.mlp.switch_mlp.down_proj": true,
+ "language_model.model.layers.13.mlp.gate": false,
+ "language_model.model.layers.13.mlp.shared_experts.gate_proj": true,
+ "language_model.model.layers.13.mlp.shared_experts.up_proj": true,
+ "language_model.model.layers.13.mlp.shared_experts.down_proj": true,
+ "language_model.model.layers.13.input_layernorm": false,
+ "language_model.model.layers.13.post_attention_layernorm": false,
+ "language_model.model.layers.14.self_attn.q_proj": true,
+ "language_model.model.layers.14.self_attn.kv_a_proj_with_mqa": true,
+ "language_model.model.layers.14.self_attn.kv_a_layernorm": false,
+ "language_model.model.layers.14.self_attn.kv_b_proj": true,
+ "language_model.model.layers.14.self_attn.o_proj": true,
+ "language_model.model.layers.14.self_attn.rope": false,
+ "language_model.model.layers.14.mlp.switch_mlp.gate_proj": true,
+ "language_model.model.layers.14.mlp.switch_mlp.up_proj": true,
+ "language_model.model.layers.14.mlp.switch_mlp.down_proj": true,
+ "language_model.model.layers.14.mlp.gate": false,
+ "language_model.model.layers.14.mlp.shared_experts.gate_proj": true,
+ "language_model.model.layers.14.mlp.shared_experts.up_proj": true,
+ "language_model.model.layers.14.mlp.shared_experts.down_proj": true,
+ "language_model.model.layers.14.input_layernorm": false,
+ "language_model.model.layers.14.post_attention_layernorm": false,
+ "language_model.model.layers.15.self_attn.q_proj": true,
+ "language_model.model.layers.15.self_attn.kv_a_proj_with_mqa": true,
+ "language_model.model.layers.15.self_attn.kv_a_layernorm": false,
+ "language_model.model.layers.15.self_attn.kv_b_proj": true,
+ "language_model.model.layers.15.self_attn.o_proj": true,
+ "language_model.model.layers.15.self_attn.rope": false,
+ "language_model.model.layers.15.mlp.switch_mlp.gate_proj": true,
+ "language_model.model.layers.15.mlp.switch_mlp.up_proj": true,
+ "language_model.model.layers.15.mlp.switch_mlp.down_proj": true,
+ "language_model.model.layers.15.mlp.gate": false,
+ "language_model.model.layers.15.mlp.shared_experts.gate_proj": true,
+ "language_model.model.layers.15.mlp.shared_experts.up_proj": true,
+ "language_model.model.layers.15.mlp.shared_experts.down_proj": true,
+ "language_model.model.layers.15.input_layernorm": false,
+ "language_model.model.layers.15.post_attention_layernorm": false,
+ "language_model.model.layers.16.self_attn.q_proj": true,
+ "language_model.model.layers.16.self_attn.kv_a_proj_with_mqa": true,
+ "language_model.model.layers.16.self_attn.kv_a_layernorm": false,
+ "language_model.model.layers.16.self_attn.kv_b_proj": true,
+ "language_model.model.layers.16.self_attn.o_proj": true,
+ "language_model.model.layers.16.self_attn.rope": false,
+ "language_model.model.layers.16.mlp.switch_mlp.gate_proj": true,
+ "language_model.model.layers.16.mlp.switch_mlp.up_proj": true,
+ "language_model.model.layers.16.mlp.switch_mlp.down_proj": true,
+ "language_model.model.layers.16.mlp.gate": false,
+ "language_model.model.layers.16.mlp.shared_experts.gate_proj": true,
+ "language_model.model.layers.16.mlp.shared_experts.up_proj": true,
+ "language_model.model.layers.16.mlp.shared_experts.down_proj": true,
+ "language_model.model.layers.16.input_layernorm": false,
+ "language_model.model.layers.16.post_attention_layernorm": false,
+ "language_model.model.layers.17.self_attn.q_proj": true,
+ "language_model.model.layers.17.self_attn.kv_a_proj_with_mqa": true,
+ "language_model.model.layers.17.self_attn.kv_a_layernorm": false,
+ "language_model.model.layers.17.self_attn.kv_b_proj": true,
+ "language_model.model.layers.17.self_attn.o_proj": true,
+ "language_model.model.layers.17.self_attn.rope": false,
+ "language_model.model.layers.17.mlp.switch_mlp.gate_proj": true,
+ "language_model.model.layers.17.mlp.switch_mlp.up_proj": true,
+ "language_model.model.layers.17.mlp.switch_mlp.down_proj": true,
+ "language_model.model.layers.17.mlp.gate": false,
+ "language_model.model.layers.17.mlp.shared_experts.gate_proj": true,
+ "language_model.model.layers.17.mlp.shared_experts.up_proj": true,
+ "language_model.model.layers.17.mlp.shared_experts.down_proj": true,
+ "language_model.model.layers.17.input_layernorm": false,
+ "language_model.model.layers.17.post_attention_layernorm": false,
+ "language_model.model.layers.18.self_attn.q_proj": true,
+ "language_model.model.layers.18.self_attn.kv_a_proj_with_mqa": true,
+ "language_model.model.layers.18.self_attn.kv_a_layernorm": false,
+ "language_model.model.layers.18.self_attn.kv_b_proj": true,
+ "language_model.model.layers.18.self_attn.o_proj": true,
+ "language_model.model.layers.18.self_attn.rope": false,
+ "language_model.model.layers.18.mlp.switch_mlp.gate_proj": true,
+ "language_model.model.layers.18.mlp.switch_mlp.up_proj": true,
+ "language_model.model.layers.18.mlp.switch_mlp.down_proj": true,
+ "language_model.model.layers.18.mlp.gate": false,
+ "language_model.model.layers.18.mlp.shared_experts.gate_proj": true,
+ "language_model.model.layers.18.mlp.shared_experts.up_proj": true,
+ "language_model.model.layers.18.mlp.shared_experts.down_proj": true,
+ "language_model.model.layers.18.input_layernorm": false,
+ "language_model.model.layers.18.post_attention_layernorm": false,
+ "language_model.model.layers.19.self_attn.q_proj": true,
+ "language_model.model.layers.19.self_attn.kv_a_proj_with_mqa": true,
+ "language_model.model.layers.19.self_attn.kv_a_layernorm": false,
+ "language_model.model.layers.19.self_attn.kv_b_proj": true,
+ "language_model.model.layers.19.self_attn.o_proj": true,
+ "language_model.model.layers.19.self_attn.rope": false,
+ "language_model.model.layers.19.mlp.switch_mlp.gate_proj": true,
+ "language_model.model.layers.19.mlp.switch_mlp.up_proj": true,
+ "language_model.model.layers.19.mlp.switch_mlp.down_proj": true,
+ "language_model.model.layers.19.mlp.gate": false,
+ "language_model.model.layers.19.mlp.shared_experts.gate_proj": true,
+ "language_model.model.layers.19.mlp.shared_experts.up_proj": true,
+ "language_model.model.layers.19.mlp.shared_experts.down_proj": true,
+ "language_model.model.layers.19.input_layernorm": false,
+ "language_model.model.layers.19.post_attention_layernorm": false,
+ "language_model.model.layers.20.self_attn.q_proj": true,
+ "language_model.model.layers.20.self_attn.kv_a_proj_with_mqa": true,
+ "language_model.model.layers.20.self_attn.kv_a_layernorm": false,
+ "language_model.model.layers.20.self_attn.kv_b_proj": true,
+ "language_model.model.layers.20.self_attn.o_proj": true,
+ "language_model.model.layers.20.self_attn.rope": false,
+ "language_model.model.layers.20.mlp.switch_mlp.gate_proj": true,
+ "language_model.model.layers.20.mlp.switch_mlp.up_proj": true,
+ "language_model.model.layers.20.mlp.switch_mlp.down_proj": true,
+ "language_model.model.layers.20.mlp.gate": false,
+ "language_model.model.layers.20.mlp.shared_experts.gate_proj": true,
+ "language_model.model.layers.20.mlp.shared_experts.up_proj": true,
+ "language_model.model.layers.20.mlp.shared_experts.down_proj": true,
+ "language_model.model.layers.20.input_layernorm": false,
+ "language_model.model.layers.20.post_attention_layernorm": false,
+ "language_model.model.layers.21.self_attn.q_proj": true,
+ "language_model.model.layers.21.self_attn.kv_a_proj_with_mqa": true,
+ "language_model.model.layers.21.self_attn.kv_a_layernorm": false,
+ "language_model.model.layers.21.self_attn.kv_b_proj": true,
+ "language_model.model.layers.21.self_attn.o_proj": true,
+ "language_model.model.layers.21.self_attn.rope": false,
+ "language_model.model.layers.21.mlp.switch_mlp.gate_proj": true,
+ "language_model.model.layers.21.mlp.switch_mlp.up_proj": true,
+ "language_model.model.layers.21.mlp.switch_mlp.down_proj": true,
+ "language_model.model.layers.21.mlp.gate": false,
+ "language_model.model.layers.21.mlp.shared_experts.gate_proj": true,
+ "language_model.model.layers.21.mlp.shared_experts.up_proj": true,
+ "language_model.model.layers.21.mlp.shared_experts.down_proj": true,
+ "language_model.model.layers.21.input_layernorm": false,
+ "language_model.model.layers.21.post_attention_layernorm": false,
+ "language_model.model.layers.22.self_attn.q_proj": true,
+ "language_model.model.layers.22.self_attn.kv_a_proj_with_mqa": true,
+ "language_model.model.layers.22.self_attn.kv_a_layernorm": false,
+ "language_model.model.layers.22.self_attn.kv_b_proj": true,
+ "language_model.model.layers.22.self_attn.o_proj": true,
+ "language_model.model.layers.22.self_attn.rope": false,
+ "language_model.model.layers.22.mlp.switch_mlp.gate_proj": true,
+ "language_model.model.layers.22.mlp.switch_mlp.up_proj": true,
+ "language_model.model.layers.22.mlp.switch_mlp.down_proj": true,
+ "language_model.model.layers.22.mlp.gate": false,
+ "language_model.model.layers.22.mlp.shared_experts.gate_proj": true,
+ "language_model.model.layers.22.mlp.shared_experts.up_proj": true,
+ "language_model.model.layers.22.mlp.shared_experts.down_proj": true,
+ "language_model.model.layers.22.input_layernorm": false,
+ "language_model.model.layers.22.post_attention_layernorm": false,
+ "language_model.model.layers.23.self_attn.q_proj": true,
+ "language_model.model.layers.23.self_attn.kv_a_proj_with_mqa": true,
+ "language_model.model.layers.23.self_attn.kv_a_layernorm": false,
+ "language_model.model.layers.23.self_attn.kv_b_proj": true,
+ "language_model.model.layers.23.self_attn.o_proj": true,
+ "language_model.model.layers.23.self_attn.rope": false,
+ "language_model.model.layers.23.mlp.switch_mlp.gate_proj": true,
+ "language_model.model.layers.23.mlp.switch_mlp.up_proj": true,
+ "language_model.model.layers.23.mlp.switch_mlp.down_proj": true,
+ "language_model.model.layers.23.mlp.gate": false,
+ "language_model.model.layers.23.mlp.shared_experts.gate_proj": true,
+ "language_model.model.layers.23.mlp.shared_experts.up_proj": true,
+ "language_model.model.layers.23.mlp.shared_experts.down_proj": true,
+ "language_model.model.layers.23.input_layernorm": false,
+ "language_model.model.layers.23.post_attention_layernorm": false,
+ "language_model.model.layers.24.self_attn.q_proj": true,
+ "language_model.model.layers.24.self_attn.kv_a_proj_with_mqa": true,
+ "language_model.model.layers.24.self_attn.kv_a_layernorm": false,
+ "language_model.model.layers.24.self_attn.kv_b_proj": true,
+ "language_model.model.layers.24.self_attn.o_proj": true,
+ "language_model.model.layers.24.self_attn.rope": false,
+ "language_model.model.layers.24.mlp.switch_mlp.gate_proj": true,
+ "language_model.model.layers.24.mlp.switch_mlp.up_proj": true,
+ "language_model.model.layers.24.mlp.switch_mlp.down_proj": true,
+ "language_model.model.layers.24.mlp.gate": false,
+ "language_model.model.layers.24.mlp.shared_experts.gate_proj": true,
+ "language_model.model.layers.24.mlp.shared_experts.up_proj": true,
+ "language_model.model.layers.24.mlp.shared_experts.down_proj": true,
+ "language_model.model.layers.24.input_layernorm": false,
+ "language_model.model.layers.24.post_attention_layernorm": false,
+ "language_model.model.layers.25.self_attn.q_proj": true,
+ "language_model.model.layers.25.self_attn.kv_a_proj_with_mqa": true,
+ "language_model.model.layers.25.self_attn.kv_a_layernorm": false,
+ "language_model.model.layers.25.self_attn.kv_b_proj": true,
+ "language_model.model.layers.25.self_attn.o_proj": true,
+ "language_model.model.layers.25.self_attn.rope": false,
+ "language_model.model.layers.25.mlp.switch_mlp.gate_proj": true,
+ "language_model.model.layers.25.mlp.switch_mlp.up_proj": true,
+ "language_model.model.layers.25.mlp.switch_mlp.down_proj": true,
+ "language_model.model.layers.25.mlp.gate": false,
+ "language_model.model.layers.25.mlp.shared_experts.gate_proj": true,
+ "language_model.model.layers.25.mlp.shared_experts.up_proj": true,
+ "language_model.model.layers.25.mlp.shared_experts.down_proj": true,
+ "language_model.model.layers.25.input_layernorm": false,
+ "language_model.model.layers.25.post_attention_layernorm": false,
+ "language_model.model.layers.26.self_attn.q_proj": true,
+ "language_model.model.layers.26.self_attn.kv_a_proj_with_mqa": true,
+ "language_model.model.layers.26.self_attn.kv_a_layernorm": false,
+ "language_model.model.layers.26.self_attn.kv_b_proj": true,
+ "language_model.model.layers.26.self_attn.o_proj": true,
+ "language_model.model.layers.26.self_attn.rope": false,
+ "language_model.model.layers.26.mlp.switch_mlp.gate_proj": true,
+ "language_model.model.layers.26.mlp.switch_mlp.up_proj": true,
+ "language_model.model.layers.26.mlp.switch_mlp.down_proj": true,
+ "language_model.model.layers.26.mlp.gate": false,
+ "language_model.model.layers.26.mlp.shared_experts.gate_proj": true,
+ "language_model.model.layers.26.mlp.shared_experts.up_proj": true,
+ "language_model.model.layers.26.mlp.shared_experts.down_proj": true,
+ "language_model.model.layers.26.input_layernorm": false,
+ "language_model.model.layers.26.post_attention_layernorm": false,
+ "language_model.model.norm": false,
+ "language_model.lm_head": true,
+ "multi_modal_projector.pre_norm": false,
+ "multi_modal_projector.linear_1": true,
+ "multi_modal_projector.act": false,
+ "multi_modal_projector.linear_2": true
+ },
+ "quantization_config": {
+ "group_size": 64,
+ "bits": 8,
+ "vision_tower.patch_embed.proj": false,
+ "vision_tower.patch_embed.pos_emb": false,
+ "vision_tower.rotary_pos_emb": false,
+ "vision_tower.blocks.0.norm0": false,
+ "vision_tower.blocks.0.norm1": false,
+ "vision_tower.blocks.0.attn.wqkv": false,
+ "vision_tower.blocks.0.attn.wo": false,
+ "vision_tower.blocks.0.mlp.activation_fn": false,
+ "vision_tower.blocks.0.mlp.fc0": false,
+ "vision_tower.blocks.0.mlp.fc1": false,
+ "vision_tower.blocks.1.norm0": false,
+ "vision_tower.blocks.1.norm1": false,
+ "vision_tower.blocks.1.attn.wqkv": false,
+ "vision_tower.blocks.1.attn.wo": false,
+ "vision_tower.blocks.1.mlp.activation_fn": false,
+ "vision_tower.blocks.1.mlp.fc0": false,
+ "vision_tower.blocks.1.mlp.fc1": false,
+ "vision_tower.blocks.2.norm0": false,
+ "vision_tower.blocks.2.norm1": false,
+ "vision_tower.blocks.2.attn.wqkv": false,
+ "vision_tower.blocks.2.attn.wo": false,
+ "vision_tower.blocks.2.mlp.activation_fn": false,
+ "vision_tower.blocks.2.mlp.fc0": false,
+ "vision_tower.blocks.2.mlp.fc1": false,
+ "vision_tower.blocks.3.norm0": false,
+ "vision_tower.blocks.3.norm1": false,
+ "vision_tower.blocks.3.attn.wqkv": false,
+ "vision_tower.blocks.3.attn.wo": false,
+ "vision_tower.blocks.3.mlp.activation_fn": false,
+ "vision_tower.blocks.3.mlp.fc0": false,
+ "vision_tower.blocks.3.mlp.fc1": false,
+ "vision_tower.blocks.4.norm0": false,
+ "vision_tower.blocks.4.norm1": false,
+ "vision_tower.blocks.4.attn.wqkv": false,
+ "vision_tower.blocks.4.attn.wo": false,
+ "vision_tower.blocks.4.mlp.activation_fn": false,
+ "vision_tower.blocks.4.mlp.fc0": false,
+ "vision_tower.blocks.4.mlp.fc1": false,
+ "vision_tower.blocks.5.norm0": false,
+ "vision_tower.blocks.5.norm1": false,
+ "vision_tower.blocks.5.attn.wqkv": false,
+ "vision_tower.blocks.5.attn.wo": false,
+ "vision_tower.blocks.5.mlp.activation_fn": false,
+ "vision_tower.blocks.5.mlp.fc0": false,
+ "vision_tower.blocks.5.mlp.fc1": false,
+ "vision_tower.blocks.6.norm0": false,
+ "vision_tower.blocks.6.norm1": false,
+ "vision_tower.blocks.6.attn.wqkv": false,
+ "vision_tower.blocks.6.attn.wo": false,
+ "vision_tower.blocks.6.mlp.activation_fn": false,
+ "vision_tower.blocks.6.mlp.fc0": false,
+ "vision_tower.blocks.6.mlp.fc1": false,
+ "vision_tower.blocks.7.norm0": false,
+ "vision_tower.blocks.7.norm1": false,
+ "vision_tower.blocks.7.attn.wqkv": false,
+ "vision_tower.blocks.7.attn.wo": false,
+ "vision_tower.blocks.7.mlp.activation_fn": false,
+ "vision_tower.blocks.7.mlp.fc0": false,
+ "vision_tower.blocks.7.mlp.fc1": false,
+ "vision_tower.blocks.8.norm0": false,
+ "vision_tower.blocks.8.norm1": false,
+ "vision_tower.blocks.8.attn.wqkv": false,
+ "vision_tower.blocks.8.attn.wo": false,
+ "vision_tower.blocks.8.mlp.activation_fn": false,
+ "vision_tower.blocks.8.mlp.fc0": false,
+ "vision_tower.blocks.8.mlp.fc1": false,
+ "vision_tower.blocks.9.norm0": false,
+ "vision_tower.blocks.9.norm1": false,
+ "vision_tower.blocks.9.attn.wqkv": false,
+ "vision_tower.blocks.9.attn.wo": false,
+ "vision_tower.blocks.9.mlp.activation_fn": false,
+ "vision_tower.blocks.9.mlp.fc0": false,
+ "vision_tower.blocks.9.mlp.fc1": false,
+ "vision_tower.blocks.10.norm0": false,
+ "vision_tower.blocks.10.norm1": false,
+ "vision_tower.blocks.10.attn.wqkv": false,
+ "vision_tower.blocks.10.attn.wo": false,
+ "vision_tower.blocks.10.mlp.activation_fn": false,
+ "vision_tower.blocks.10.mlp.fc0": false,
+ "vision_tower.blocks.10.mlp.fc1": false,
+ "vision_tower.blocks.11.norm0": false,
+ "vision_tower.blocks.11.norm1": false,
+ "vision_tower.blocks.11.attn.wqkv": false,
+ "vision_tower.blocks.11.attn.wo": false,
+ "vision_tower.blocks.11.mlp.activation_fn": false,
+ "vision_tower.blocks.11.mlp.fc0": false,
+ "vision_tower.blocks.11.mlp.fc1": false,
+ "vision_tower.blocks.12.norm0": false,
+ "vision_tower.blocks.12.norm1": false,
+ "vision_tower.blocks.12.attn.wqkv": false,
+ "vision_tower.blocks.12.attn.wo": false,
+ "vision_tower.blocks.12.mlp.activation_fn": false,
+ "vision_tower.blocks.12.mlp.fc0": false,
+ "vision_tower.blocks.12.mlp.fc1": false,
+ "vision_tower.blocks.13.norm0": false,
+ "vision_tower.blocks.13.norm1": false,
+ "vision_tower.blocks.13.attn.wqkv": false,
+ "vision_tower.blocks.13.attn.wo": false,
+ "vision_tower.blocks.13.mlp.activation_fn": false,
+ "vision_tower.blocks.13.mlp.fc0": false,
+ "vision_tower.blocks.13.mlp.fc1": false,
+ "vision_tower.blocks.14.norm0": false,
+ "vision_tower.blocks.14.norm1": false,
+ "vision_tower.blocks.14.attn.wqkv": false,
+ "vision_tower.blocks.14.attn.wo": false,
+ "vision_tower.blocks.14.mlp.activation_fn": false,
+ "vision_tower.blocks.14.mlp.fc0": false,
+ "vision_tower.blocks.14.mlp.fc1": false,
+ "vision_tower.blocks.15.norm0": false,
+ "vision_tower.blocks.15.norm1": false,
+ "vision_tower.blocks.15.attn.wqkv": false,
+ "vision_tower.blocks.15.attn.wo": false,
+ "vision_tower.blocks.15.mlp.activation_fn": false,
+ "vision_tower.blocks.15.mlp.fc0": false,
+ "vision_tower.blocks.15.mlp.fc1": false,
+ "vision_tower.blocks.16.norm0": false,
+ "vision_tower.blocks.16.norm1": false,
+ "vision_tower.blocks.16.attn.wqkv": false,
+ "vision_tower.blocks.16.attn.wo": false,
+ "vision_tower.blocks.16.mlp.activation_fn": false,
+ "vision_tower.blocks.16.mlp.fc0": false,
+ "vision_tower.blocks.16.mlp.fc1": false,
+ "vision_tower.blocks.17.norm0": false,
+ "vision_tower.blocks.17.norm1": false,
+ "vision_tower.blocks.17.attn.wqkv": false,
+ "vision_tower.blocks.17.attn.wo": false,
+ "vision_tower.blocks.17.mlp.activation_fn": false,
+ "vision_tower.blocks.17.mlp.fc0": false,
+ "vision_tower.blocks.17.mlp.fc1": false,
+ "vision_tower.blocks.18.norm0": false,
+ "vision_tower.blocks.18.norm1": false,
+ "vision_tower.blocks.18.attn.wqkv": false,
+ "vision_tower.blocks.18.attn.wo": false,
+ "vision_tower.blocks.18.mlp.activation_fn": false,
+ "vision_tower.blocks.18.mlp.fc0": false,
+ "vision_tower.blocks.18.mlp.fc1": false,
+ "vision_tower.blocks.19.norm0": false,
+ "vision_tower.blocks.19.norm1": false,
+ "vision_tower.blocks.19.attn.wqkv": false,
+ "vision_tower.blocks.19.attn.wo": false,
+ "vision_tower.blocks.19.mlp.activation_fn": false,
+ "vision_tower.blocks.19.mlp.fc0": false,
+ "vision_tower.blocks.19.mlp.fc1": false,
+ "vision_tower.blocks.20.norm0": false,
+ "vision_tower.blocks.20.norm1": false,
+ "vision_tower.blocks.20.attn.wqkv": false,
+ "vision_tower.blocks.20.attn.wo": false,
+ "vision_tower.blocks.20.mlp.activation_fn": false,
+ "vision_tower.blocks.20.mlp.fc0": false,
+ "vision_tower.blocks.20.mlp.fc1": false,
+ "vision_tower.blocks.21.norm0": false,
+ "vision_tower.blocks.21.norm1": false,
+ "vision_tower.blocks.21.attn.wqkv": false,
+ "vision_tower.blocks.21.attn.wo": false,
+ "vision_tower.blocks.21.mlp.activation_fn": false,
+ "vision_tower.blocks.21.mlp.fc0": false,
+ "vision_tower.blocks.21.mlp.fc1": false,
+ "vision_tower.blocks.22.norm0": false,
+ "vision_tower.blocks.22.norm1": false,
+ "vision_tower.blocks.22.attn.wqkv": false,
+ "vision_tower.blocks.22.attn.wo": false,
+ "vision_tower.blocks.22.mlp.activation_fn": false,
+ "vision_tower.blocks.22.mlp.fc0": false,
+ "vision_tower.blocks.22.mlp.fc1": false,
+ "vision_tower.blocks.23.norm0": false,
+ "vision_tower.blocks.23.norm1": false,
+ "vision_tower.blocks.23.attn.wqkv": false,
+ "vision_tower.blocks.23.attn.wo": false,
+ "vision_tower.blocks.23.mlp.activation_fn": false,
+ "vision_tower.blocks.23.mlp.fc0": false,
+ "vision_tower.blocks.23.mlp.fc1": false,
+ "vision_tower.blocks.24.norm0": false,
+ "vision_tower.blocks.24.norm1": false,
+ "vision_tower.blocks.24.attn.wqkv": false,
+ "vision_tower.blocks.24.attn.wo": false,
+ "vision_tower.blocks.24.mlp.activation_fn": false,
+ "vision_tower.blocks.24.mlp.fc0": false,
+ "vision_tower.blocks.24.mlp.fc1": false,
+ "vision_tower.blocks.25.norm0": false,
+ "vision_tower.blocks.25.norm1": false,
+ "vision_tower.blocks.25.attn.wqkv": false,
+ "vision_tower.blocks.25.attn.wo": false,
+ "vision_tower.blocks.25.mlp.activation_fn": false,
+ "vision_tower.blocks.25.mlp.fc0": false,
+ "vision_tower.blocks.25.mlp.fc1": false,
+ "vision_tower.blocks.26.norm0": false,
+ "vision_tower.blocks.26.norm1": false,
+ "vision_tower.blocks.26.attn.wqkv": false,
+ "vision_tower.blocks.26.attn.wo": false,
+ "vision_tower.blocks.26.mlp.activation_fn": false,
+ "vision_tower.blocks.26.mlp.fc0": false,
+ "vision_tower.blocks.26.mlp.fc1": false,
+ "vision_tower.final_layernorm": false,
+ "vision_tower.rope_pos_emb": false,
+ "language_model.model.embed_tokens": true,
+ "language_model.model.layers.0.self_attn.q_proj": true,
+ "language_model.model.layers.0.self_attn.kv_a_proj_with_mqa": true,
+ "language_model.model.layers.0.self_attn.kv_a_layernorm": false,
+ "language_model.model.layers.0.self_attn.kv_b_proj": true,
+ "language_model.model.layers.0.self_attn.o_proj": true,
+ "language_model.model.layers.0.self_attn.rope": false,
+ "language_model.model.layers.0.mlp.gate_proj": true,
+ "language_model.model.layers.0.mlp.up_proj": true,
+ "language_model.model.layers.0.mlp.down_proj": true,
+ "language_model.model.layers.0.input_layernorm": false,
+ "language_model.model.layers.0.post_attention_layernorm": false,
+ "language_model.model.layers.1.self_attn.q_proj": true,
+ "language_model.model.layers.1.self_attn.kv_a_proj_with_mqa": true,
+ "language_model.model.layers.1.self_attn.kv_a_layernorm": false,
+ "language_model.model.layers.1.self_attn.kv_b_proj": true,
+ "language_model.model.layers.1.self_attn.o_proj": true,
+ "language_model.model.layers.1.self_attn.rope": false,
+ "language_model.model.layers.1.mlp.switch_mlp.gate_proj": true,
+ "language_model.model.layers.1.mlp.switch_mlp.up_proj": true,
+ "language_model.model.layers.1.mlp.switch_mlp.down_proj": true,
+ "language_model.model.layers.1.mlp.gate": false,
+ "language_model.model.layers.1.mlp.shared_experts.gate_proj": true,
+ "language_model.model.layers.1.mlp.shared_experts.up_proj": true,
+ "language_model.model.layers.1.mlp.shared_experts.down_proj": true,
+ "language_model.model.layers.1.input_layernorm": false,
+ "language_model.model.layers.1.post_attention_layernorm": false,
+ "language_model.model.layers.2.self_attn.q_proj": true,
+ "language_model.model.layers.2.self_attn.kv_a_proj_with_mqa": true,
+ "language_model.model.layers.2.self_attn.kv_a_layernorm": false,
+ "language_model.model.layers.2.self_attn.kv_b_proj": true,
+ "language_model.model.layers.2.self_attn.o_proj": true,
+ "language_model.model.layers.2.self_attn.rope": false,
+ "language_model.model.layers.2.mlp.switch_mlp.gate_proj": true,
+ "language_model.model.layers.2.mlp.switch_mlp.up_proj": true,
+ "language_model.model.layers.2.mlp.switch_mlp.down_proj": true,
+ "language_model.model.layers.2.mlp.gate": false,
+ "language_model.model.layers.2.mlp.shared_experts.gate_proj": true,
+ "language_model.model.layers.2.mlp.shared_experts.up_proj": true,
+ "language_model.model.layers.2.mlp.shared_experts.down_proj": true,
+ "language_model.model.layers.2.input_layernorm": false,
+ "language_model.model.layers.2.post_attention_layernorm": false,
+ "language_model.model.layers.3.self_attn.q_proj": true,
+ "language_model.model.layers.3.self_attn.kv_a_proj_with_mqa": true,
+ "language_model.model.layers.3.self_attn.kv_a_layernorm": false,
+ "language_model.model.layers.3.self_attn.kv_b_proj": true,
+ "language_model.model.layers.3.self_attn.o_proj": true,
+ "language_model.model.layers.3.self_attn.rope": false,
+ "language_model.model.layers.3.mlp.switch_mlp.gate_proj": true,
+ "language_model.model.layers.3.mlp.switch_mlp.up_proj": true,
+ "language_model.model.layers.3.mlp.switch_mlp.down_proj": true,
+ "language_model.model.layers.3.mlp.gate": false,
+ "language_model.model.layers.3.mlp.shared_experts.gate_proj": true,
+ "language_model.model.layers.3.mlp.shared_experts.up_proj": true,
+ "language_model.model.layers.3.mlp.shared_experts.down_proj": true,
+ "language_model.model.layers.3.input_layernorm": false,
+ "language_model.model.layers.3.post_attention_layernorm": false,
+ "language_model.model.layers.4.self_attn.q_proj": true,
+ "language_model.model.layers.4.self_attn.kv_a_proj_with_mqa": true,
+ "language_model.model.layers.4.self_attn.kv_a_layernorm": false,
+ "language_model.model.layers.4.self_attn.kv_b_proj": true,
+ "language_model.model.layers.4.self_attn.o_proj": true,
+ "language_model.model.layers.4.self_attn.rope": false,
+ "language_model.model.layers.4.mlp.switch_mlp.gate_proj": true,
+ "language_model.model.layers.4.mlp.switch_mlp.up_proj": true,
+ "language_model.model.layers.4.mlp.switch_mlp.down_proj": true,
+ "language_model.model.layers.4.mlp.gate": false,
+ "language_model.model.layers.4.mlp.shared_experts.gate_proj": true,
+ "language_model.model.layers.4.mlp.shared_experts.up_proj": true,
+ "language_model.model.layers.4.mlp.shared_experts.down_proj": true,
+ "language_model.model.layers.4.input_layernorm": false,
+ "language_model.model.layers.4.post_attention_layernorm": false,
+ "language_model.model.layers.5.self_attn.q_proj": true,
+ "language_model.model.layers.5.self_attn.kv_a_proj_with_mqa": true,
+ "language_model.model.layers.5.self_attn.kv_a_layernorm": false,
+ "language_model.model.layers.5.self_attn.kv_b_proj": true,
+ "language_model.model.layers.5.self_attn.o_proj": true,
+ "language_model.model.layers.5.self_attn.rope": false,
+ "language_model.model.layers.5.mlp.switch_mlp.gate_proj": true,
+ "language_model.model.layers.5.mlp.switch_mlp.up_proj": true,
+ "language_model.model.layers.5.mlp.switch_mlp.down_proj": true,
+ "language_model.model.layers.5.mlp.gate": false,
+ "language_model.model.layers.5.mlp.shared_experts.gate_proj": true,
+ "language_model.model.layers.5.mlp.shared_experts.up_proj": true,
+ "language_model.model.layers.5.mlp.shared_experts.down_proj": true,
+ "language_model.model.layers.5.input_layernorm": false,
+ "language_model.model.layers.5.post_attention_layernorm": false,
+ "language_model.model.layers.6.self_attn.q_proj": true,
+ "language_model.model.layers.6.self_attn.kv_a_proj_with_mqa": true,
+ "language_model.model.layers.6.self_attn.kv_a_layernorm": false,
+ "language_model.model.layers.6.self_attn.kv_b_proj": true,
+ "language_model.model.layers.6.self_attn.o_proj": true,
+ "language_model.model.layers.6.self_attn.rope": false,
+ "language_model.model.layers.6.mlp.switch_mlp.gate_proj": true,
+ "language_model.model.layers.6.mlp.switch_mlp.up_proj": true,
+ "language_model.model.layers.6.mlp.switch_mlp.down_proj": true,
+ "language_model.model.layers.6.mlp.gate": false,
+ "language_model.model.layers.6.mlp.shared_experts.gate_proj": true,
+ "language_model.model.layers.6.mlp.shared_experts.up_proj": true,
+ "language_model.model.layers.6.mlp.shared_experts.down_proj": true,
+ "language_model.model.layers.6.input_layernorm": false,
+ "language_model.model.layers.6.post_attention_layernorm": false,
+ "language_model.model.layers.7.self_attn.q_proj": true,
+ "language_model.model.layers.7.self_attn.kv_a_proj_with_mqa": true,
+ "language_model.model.layers.7.self_attn.kv_a_layernorm": false,
+ "language_model.model.layers.7.self_attn.kv_b_proj": true,
+ "language_model.model.layers.7.self_attn.o_proj": true,
+ "language_model.model.layers.7.self_attn.rope": false,
+ "language_model.model.layers.7.mlp.switch_mlp.gate_proj": true,
+ "language_model.model.layers.7.mlp.switch_mlp.up_proj": true,
+ "language_model.model.layers.7.mlp.switch_mlp.down_proj": true,
+ "language_model.model.layers.7.mlp.gate": false,
+ "language_model.model.layers.7.mlp.shared_experts.gate_proj": true,
+ "language_model.model.layers.7.mlp.shared_experts.up_proj": true,
+ "language_model.model.layers.7.mlp.shared_experts.down_proj": true,
+ "language_model.model.layers.7.input_layernorm": false,
+ "language_model.model.layers.7.post_attention_layernorm": false,
+ "language_model.model.layers.8.self_attn.q_proj": true,
+ "language_model.model.layers.8.self_attn.kv_a_proj_with_mqa": true,
+ "language_model.model.layers.8.self_attn.kv_a_layernorm": false,
+ "language_model.model.layers.8.self_attn.kv_b_proj": true,
+ "language_model.model.layers.8.self_attn.o_proj": true,
+ "language_model.model.layers.8.self_attn.rope": false,
+ "language_model.model.layers.8.mlp.switch_mlp.gate_proj": true,
+ "language_model.model.layers.8.mlp.switch_mlp.up_proj": true,
+ "language_model.model.layers.8.mlp.switch_mlp.down_proj": true,
+ "language_model.model.layers.8.mlp.gate": false,
+ "language_model.model.layers.8.mlp.shared_experts.gate_proj": true,
984
+ "language_model.model.layers.8.mlp.shared_experts.up_proj": true,
985
+ "language_model.model.layers.8.mlp.shared_experts.down_proj": true,
986
+ "language_model.model.layers.8.input_layernorm": false,
987
+ "language_model.model.layers.8.post_attention_layernorm": false,
988
+ "language_model.model.layers.9.self_attn.q_proj": true,
989
+ "language_model.model.layers.9.self_attn.kv_a_proj_with_mqa": true,
990
+ "language_model.model.layers.9.self_attn.kv_a_layernorm": false,
991
+ "language_model.model.layers.9.self_attn.kv_b_proj": true,
992
+ "language_model.model.layers.9.self_attn.o_proj": true,
993
+ "language_model.model.layers.9.self_attn.rope": false,
994
+ "language_model.model.layers.9.mlp.switch_mlp.gate_proj": true,
995
+ "language_model.model.layers.9.mlp.switch_mlp.up_proj": true,
996
+ "language_model.model.layers.9.mlp.switch_mlp.down_proj": true,
997
+ "language_model.model.layers.9.mlp.gate": false,
998
+ "language_model.model.layers.9.mlp.shared_experts.gate_proj": true,
999
+ "language_model.model.layers.9.mlp.shared_experts.up_proj": true,
1000
+ "language_model.model.layers.9.mlp.shared_experts.down_proj": true,
1001
+ "language_model.model.layers.9.input_layernorm": false,
1002
+ "language_model.model.layers.9.post_attention_layernorm": false,
1003
+ "language_model.model.layers.10.self_attn.q_proj": true,
1004
+ "language_model.model.layers.10.self_attn.kv_a_proj_with_mqa": true,
1005
+ "language_model.model.layers.10.self_attn.kv_a_layernorm": false,
1006
+ "language_model.model.layers.10.self_attn.kv_b_proj": true,
1007
+ "language_model.model.layers.10.self_attn.o_proj": true,
1008
+ "language_model.model.layers.10.self_attn.rope": false,
1009
+ "language_model.model.layers.10.mlp.switch_mlp.gate_proj": true,
1010
+ "language_model.model.layers.10.mlp.switch_mlp.up_proj": true,
1011
+ "language_model.model.layers.10.mlp.switch_mlp.down_proj": true,
1012
+ "language_model.model.layers.10.mlp.gate": false,
1013
+ "language_model.model.layers.10.mlp.shared_experts.gate_proj": true,
1014
+ "language_model.model.layers.10.mlp.shared_experts.up_proj": true,
1015
+ "language_model.model.layers.10.mlp.shared_experts.down_proj": true,
1016
+ "language_model.model.layers.10.input_layernorm": false,
1017
+ "language_model.model.layers.10.post_attention_layernorm": false,
1018
+ "language_model.model.layers.11.self_attn.q_proj": true,
1019
+ "language_model.model.layers.11.self_attn.kv_a_proj_with_mqa": true,
1020
+ "language_model.model.layers.11.self_attn.kv_a_layernorm": false,
1021
+ "language_model.model.layers.11.self_attn.kv_b_proj": true,
1022
+ "language_model.model.layers.11.self_attn.o_proj": true,
1023
+ "language_model.model.layers.11.self_attn.rope": false,
1024
+ "language_model.model.layers.11.mlp.switch_mlp.gate_proj": true,
1025
+ "language_model.model.layers.11.mlp.switch_mlp.up_proj": true,
1026
+ "language_model.model.layers.11.mlp.switch_mlp.down_proj": true,
1027
+ "language_model.model.layers.11.mlp.gate": false,
1028
+ "language_model.model.layers.11.mlp.shared_experts.gate_proj": true,
1029
+ "language_model.model.layers.11.mlp.shared_experts.up_proj": true,
1030
+ "language_model.model.layers.11.mlp.shared_experts.down_proj": true,
1031
+ "language_model.model.layers.11.input_layernorm": false,
1032
+ "language_model.model.layers.11.post_attention_layernorm": false,
1033
+ "language_model.model.layers.12.self_attn.q_proj": true,
1034
+ "language_model.model.layers.12.self_attn.kv_a_proj_with_mqa": true,
1035
+ "language_model.model.layers.12.self_attn.kv_a_layernorm": false,
1036
+ "language_model.model.layers.12.self_attn.kv_b_proj": true,
1037
+ "language_model.model.layers.12.self_attn.o_proj": true,
1038
+ "language_model.model.layers.12.self_attn.rope": false,
1039
+ "language_model.model.layers.12.mlp.switch_mlp.gate_proj": true,
1040
+ "language_model.model.layers.12.mlp.switch_mlp.up_proj": true,
1041
+ "language_model.model.layers.12.mlp.switch_mlp.down_proj": true,
1042
+ "language_model.model.layers.12.mlp.gate": false,
1043
+ "language_model.model.layers.12.mlp.shared_experts.gate_proj": true,
1044
+ "language_model.model.layers.12.mlp.shared_experts.up_proj": true,
1045
+ "language_model.model.layers.12.mlp.shared_experts.down_proj": true,
1046
+ "language_model.model.layers.12.input_layernorm": false,
1047
+ "language_model.model.layers.12.post_attention_layernorm": false,
1048
+ "language_model.model.layers.13.self_attn.q_proj": true,
1049
+ "language_model.model.layers.13.self_attn.kv_a_proj_with_mqa": true,
1050
+ "language_model.model.layers.13.self_attn.kv_a_layernorm": false,
1051
+ "language_model.model.layers.13.self_attn.kv_b_proj": true,
1052
+ "language_model.model.layers.13.self_attn.o_proj": true,
1053
+ "language_model.model.layers.13.self_attn.rope": false,
1054
+ "language_model.model.layers.13.mlp.switch_mlp.gate_proj": true,
1055
+ "language_model.model.layers.13.mlp.switch_mlp.up_proj": true,
1056
+ "language_model.model.layers.13.mlp.switch_mlp.down_proj": true,
1057
+ "language_model.model.layers.13.mlp.gate": false,
1058
+ "language_model.model.layers.13.mlp.shared_experts.gate_proj": true,
1059
+ "language_model.model.layers.13.mlp.shared_experts.up_proj": true,
1060
+ "language_model.model.layers.13.mlp.shared_experts.down_proj": true,
1061
+ "language_model.model.layers.13.input_layernorm": false,
1062
+ "language_model.model.layers.13.post_attention_layernorm": false,
1063
+ "language_model.model.layers.14.self_attn.q_proj": true,
1064
+ "language_model.model.layers.14.self_attn.kv_a_proj_with_mqa": true,
1065
+ "language_model.model.layers.14.self_attn.kv_a_layernorm": false,
1066
+ "language_model.model.layers.14.self_attn.kv_b_proj": true,
1067
+ "language_model.model.layers.14.self_attn.o_proj": true,
1068
+ "language_model.model.layers.14.self_attn.rope": false,
1069
+ "language_model.model.layers.14.mlp.switch_mlp.gate_proj": true,
1070
+ "language_model.model.layers.14.mlp.switch_mlp.up_proj": true,
1071
+ "language_model.model.layers.14.mlp.switch_mlp.down_proj": true,
1072
+ "language_model.model.layers.14.mlp.gate": false,
1073
+ "language_model.model.layers.14.mlp.shared_experts.gate_proj": true,
1074
+ "language_model.model.layers.14.mlp.shared_experts.up_proj": true,
1075
+ "language_model.model.layers.14.mlp.shared_experts.down_proj": true,
1076
+ "language_model.model.layers.14.input_layernorm": false,
1077
+ "language_model.model.layers.14.post_attention_layernorm": false,
1078
+ "language_model.model.layers.15.self_attn.q_proj": true,
1079
+ "language_model.model.layers.15.self_attn.kv_a_proj_with_mqa": true,
1080
+ "language_model.model.layers.15.self_attn.kv_a_layernorm": false,
1081
+ "language_model.model.layers.15.self_attn.kv_b_proj": true,
1082
+ "language_model.model.layers.15.self_attn.o_proj": true,
1083
+ "language_model.model.layers.15.self_attn.rope": false,
1084
+ "language_model.model.layers.15.mlp.switch_mlp.gate_proj": true,
1085
+ "language_model.model.layers.15.mlp.switch_mlp.up_proj": true,
1086
+ "language_model.model.layers.15.mlp.switch_mlp.down_proj": true,
1087
+ "language_model.model.layers.15.mlp.gate": false,
1088
+ "language_model.model.layers.15.mlp.shared_experts.gate_proj": true,
1089
+ "language_model.model.layers.15.mlp.shared_experts.up_proj": true,
1090
+ "language_model.model.layers.15.mlp.shared_experts.down_proj": true,
1091
+ "language_model.model.layers.15.input_layernorm": false,
1092
+ "language_model.model.layers.15.post_attention_layernorm": false,
1093
+ "language_model.model.layers.16.self_attn.q_proj": true,
1094
+ "language_model.model.layers.16.self_attn.kv_a_proj_with_mqa": true,
1095
+ "language_model.model.layers.16.self_attn.kv_a_layernorm": false,
1096
+ "language_model.model.layers.16.self_attn.kv_b_proj": true,
1097
+ "language_model.model.layers.16.self_attn.o_proj": true,
1098
+ "language_model.model.layers.16.self_attn.rope": false,
1099
+ "language_model.model.layers.16.mlp.switch_mlp.gate_proj": true,
1100
+ "language_model.model.layers.16.mlp.switch_mlp.up_proj": true,
1101
+ "language_model.model.layers.16.mlp.switch_mlp.down_proj": true,
1102
+ "language_model.model.layers.16.mlp.gate": false,
1103
+ "language_model.model.layers.16.mlp.shared_experts.gate_proj": true,
1104
+ "language_model.model.layers.16.mlp.shared_experts.up_proj": true,
1105
+ "language_model.model.layers.16.mlp.shared_experts.down_proj": true,
1106
+ "language_model.model.layers.16.input_layernorm": false,
1107
+ "language_model.model.layers.16.post_attention_layernorm": false,
1108
+ "language_model.model.layers.17.self_attn.q_proj": true,
1109
+ "language_model.model.layers.17.self_attn.kv_a_proj_with_mqa": true,
1110
+ "language_model.model.layers.17.self_attn.kv_a_layernorm": false,
1111
+ "language_model.model.layers.17.self_attn.kv_b_proj": true,
1112
+ "language_model.model.layers.17.self_attn.o_proj": true,
1113
+ "language_model.model.layers.17.self_attn.rope": false,
1114
+ "language_model.model.layers.17.mlp.switch_mlp.gate_proj": true,
1115
+ "language_model.model.layers.17.mlp.switch_mlp.up_proj": true,
1116
+ "language_model.model.layers.17.mlp.switch_mlp.down_proj": true,
1117
+ "language_model.model.layers.17.mlp.gate": false,
1118
+ "language_model.model.layers.17.mlp.shared_experts.gate_proj": true,
1119
+ "language_model.model.layers.17.mlp.shared_experts.up_proj": true,
1120
+ "language_model.model.layers.17.mlp.shared_experts.down_proj": true,
1121
+ "language_model.model.layers.17.input_layernorm": false,
1122
+ "language_model.model.layers.17.post_attention_layernorm": false,
1123
+ "language_model.model.layers.18.self_attn.q_proj": true,
1124
+ "language_model.model.layers.18.self_attn.kv_a_proj_with_mqa": true,
1125
+ "language_model.model.layers.18.self_attn.kv_a_layernorm": false,
1126
+ "language_model.model.layers.18.self_attn.kv_b_proj": true,
1127
+ "language_model.model.layers.18.self_attn.o_proj": true,
1128
+ "language_model.model.layers.18.self_attn.rope": false,
1129
+ "language_model.model.layers.18.mlp.switch_mlp.gate_proj": true,
1130
+ "language_model.model.layers.18.mlp.switch_mlp.up_proj": true,
1131
+ "language_model.model.layers.18.mlp.switch_mlp.down_proj": true,
1132
+ "language_model.model.layers.18.mlp.gate": false,
1133
+ "language_model.model.layers.18.mlp.shared_experts.gate_proj": true,
1134
+ "language_model.model.layers.18.mlp.shared_experts.up_proj": true,
1135
+ "language_model.model.layers.18.mlp.shared_experts.down_proj": true,
1136
+ "language_model.model.layers.18.input_layernorm": false,
1137
+ "language_model.model.layers.18.post_attention_layernorm": false,
1138
+ "language_model.model.layers.19.self_attn.q_proj": true,
1139
+ "language_model.model.layers.19.self_attn.kv_a_proj_with_mqa": true,
1140
+ "language_model.model.layers.19.self_attn.kv_a_layernorm": false,
1141
+ "language_model.model.layers.19.self_attn.kv_b_proj": true,
1142
+ "language_model.model.layers.19.self_attn.o_proj": true,
1143
+ "language_model.model.layers.19.self_attn.rope": false,
1144
+ "language_model.model.layers.19.mlp.switch_mlp.gate_proj": true,
1145
+ "language_model.model.layers.19.mlp.switch_mlp.up_proj": true,
1146
+ "language_model.model.layers.19.mlp.switch_mlp.down_proj": true,
1147
+ "language_model.model.layers.19.mlp.gate": false,
1148
+ "language_model.model.layers.19.mlp.shared_experts.gate_proj": true,
1149
+ "language_model.model.layers.19.mlp.shared_experts.up_proj": true,
1150
+ "language_model.model.layers.19.mlp.shared_experts.down_proj": true,
1151
+ "language_model.model.layers.19.input_layernorm": false,
1152
+ "language_model.model.layers.19.post_attention_layernorm": false,
1153
+ "language_model.model.layers.20.self_attn.q_proj": true,
1154
+ "language_model.model.layers.20.self_attn.kv_a_proj_with_mqa": true,
1155
+ "language_model.model.layers.20.self_attn.kv_a_layernorm": false,
1156
+ "language_model.model.layers.20.self_attn.kv_b_proj": true,
1157
+ "language_model.model.layers.20.self_attn.o_proj": true,
1158
+ "language_model.model.layers.20.self_attn.rope": false,
1159
+ "language_model.model.layers.20.mlp.switch_mlp.gate_proj": true,
1160
+ "language_model.model.layers.20.mlp.switch_mlp.up_proj": true,
1161
+ "language_model.model.layers.20.mlp.switch_mlp.down_proj": true,
1162
+ "language_model.model.layers.20.mlp.gate": false,
1163
+ "language_model.model.layers.20.mlp.shared_experts.gate_proj": true,
1164
+ "language_model.model.layers.20.mlp.shared_experts.up_proj": true,
1165
+ "language_model.model.layers.20.mlp.shared_experts.down_proj": true,
1166
+ "language_model.model.layers.20.input_layernorm": false,
1167
+ "language_model.model.layers.20.post_attention_layernorm": false,
1168
+ "language_model.model.layers.21.self_attn.q_proj": true,
1169
+ "language_model.model.layers.21.self_attn.kv_a_proj_with_mqa": true,
1170
+ "language_model.model.layers.21.self_attn.kv_a_layernorm": false,
1171
+ "language_model.model.layers.21.self_attn.kv_b_proj": true,
1172
+ "language_model.model.layers.21.self_attn.o_proj": true,
1173
+ "language_model.model.layers.21.self_attn.rope": false,
1174
+ "language_model.model.layers.21.mlp.switch_mlp.gate_proj": true,
1175
+ "language_model.model.layers.21.mlp.switch_mlp.up_proj": true,
1176
+ "language_model.model.layers.21.mlp.switch_mlp.down_proj": true,
1177
+ "language_model.model.layers.21.mlp.gate": false,
1178
+ "language_model.model.layers.21.mlp.shared_experts.gate_proj": true,
1179
+ "language_model.model.layers.21.mlp.shared_experts.up_proj": true,
1180
+ "language_model.model.layers.21.mlp.shared_experts.down_proj": true,
1181
+ "language_model.model.layers.21.input_layernorm": false,
1182
+ "language_model.model.layers.21.post_attention_layernorm": false,
1183
+ "language_model.model.layers.22.self_attn.q_proj": true,
1184
+ "language_model.model.layers.22.self_attn.kv_a_proj_with_mqa": true,
1185
+ "language_model.model.layers.22.self_attn.kv_a_layernorm": false,
1186
+ "language_model.model.layers.22.self_attn.kv_b_proj": true,
1187
+ "language_model.model.layers.22.self_attn.o_proj": true,
1188
+ "language_model.model.layers.22.self_attn.rope": false,
1189
+ "language_model.model.layers.22.mlp.switch_mlp.gate_proj": true,
1190
+ "language_model.model.layers.22.mlp.switch_mlp.up_proj": true,
1191
+ "language_model.model.layers.22.mlp.switch_mlp.down_proj": true,
1192
+ "language_model.model.layers.22.mlp.gate": false,
1193
+ "language_model.model.layers.22.mlp.shared_experts.gate_proj": true,
1194
+ "language_model.model.layers.22.mlp.shared_experts.up_proj": true,
1195
+ "language_model.model.layers.22.mlp.shared_experts.down_proj": true,
1196
+ "language_model.model.layers.22.input_layernorm": false,
1197
+ "language_model.model.layers.22.post_attention_layernorm": false,
1198
+ "language_model.model.layers.23.self_attn.q_proj": true,
1199
+ "language_model.model.layers.23.self_attn.kv_a_proj_with_mqa": true,
1200
+ "language_model.model.layers.23.self_attn.kv_a_layernorm": false,
1201
+ "language_model.model.layers.23.self_attn.kv_b_proj": true,
1202
+ "language_model.model.layers.23.self_attn.o_proj": true,
1203
+ "language_model.model.layers.23.self_attn.rope": false,
1204
+ "language_model.model.layers.23.mlp.switch_mlp.gate_proj": true,
1205
+ "language_model.model.layers.23.mlp.switch_mlp.up_proj": true,
1206
+ "language_model.model.layers.23.mlp.switch_mlp.down_proj": true,
1207
+ "language_model.model.layers.23.mlp.gate": false,
1208
+ "language_model.model.layers.23.mlp.shared_experts.gate_proj": true,
1209
+ "language_model.model.layers.23.mlp.shared_experts.up_proj": true,
1210
+ "language_model.model.layers.23.mlp.shared_experts.down_proj": true,
1211
+ "language_model.model.layers.23.input_layernorm": false,
1212
+ "language_model.model.layers.23.post_attention_layernorm": false,
1213
+ "language_model.model.layers.24.self_attn.q_proj": true,
1214
+ "language_model.model.layers.24.self_attn.kv_a_proj_with_mqa": true,
1215
+ "language_model.model.layers.24.self_attn.kv_a_layernorm": false,
1216
+ "language_model.model.layers.24.self_attn.kv_b_proj": true,
1217
+ "language_model.model.layers.24.self_attn.o_proj": true,
1218
+ "language_model.model.layers.24.self_attn.rope": false,
1219
+ "language_model.model.layers.24.mlp.switch_mlp.gate_proj": true,
1220
+ "language_model.model.layers.24.mlp.switch_mlp.up_proj": true,
1221
+ "language_model.model.layers.24.mlp.switch_mlp.down_proj": true,
1222
+ "language_model.model.layers.24.mlp.gate": false,
1223
+ "language_model.model.layers.24.mlp.shared_experts.gate_proj": true,
1224
+ "language_model.model.layers.24.mlp.shared_experts.up_proj": true,
1225
+ "language_model.model.layers.24.mlp.shared_experts.down_proj": true,
1226
+ "language_model.model.layers.24.input_layernorm": false,
1227
+ "language_model.model.layers.24.post_attention_layernorm": false,
1228
+ "language_model.model.layers.25.self_attn.q_proj": true,
1229
+ "language_model.model.layers.25.self_attn.kv_a_proj_with_mqa": true,
1230
+ "language_model.model.layers.25.self_attn.kv_a_layernorm": false,
1231
+ "language_model.model.layers.25.self_attn.kv_b_proj": true,
1232
+ "language_model.model.layers.25.self_attn.o_proj": true,
1233
+ "language_model.model.layers.25.self_attn.rope": false,
1234
+ "language_model.model.layers.25.mlp.switch_mlp.gate_proj": true,
1235
+ "language_model.model.layers.25.mlp.switch_mlp.up_proj": true,
1236
+ "language_model.model.layers.25.mlp.switch_mlp.down_proj": true,
1237
+ "language_model.model.layers.25.mlp.gate": false,
1238
+ "language_model.model.layers.25.mlp.shared_experts.gate_proj": true,
1239
+ "language_model.model.layers.25.mlp.shared_experts.up_proj": true,
1240
+ "language_model.model.layers.25.mlp.shared_experts.down_proj": true,
1241
+ "language_model.model.layers.25.input_layernorm": false,
1242
+ "language_model.model.layers.25.post_attention_layernorm": false,
1243
+ "language_model.model.layers.26.self_attn.q_proj": true,
1244
+ "language_model.model.layers.26.self_attn.kv_a_proj_with_mqa": true,
1245
+ "language_model.model.layers.26.self_attn.kv_a_layernorm": false,
1246
+ "language_model.model.layers.26.self_attn.kv_b_proj": true,
1247
+ "language_model.model.layers.26.self_attn.o_proj": true,
1248
+ "language_model.model.layers.26.self_attn.rope": false,
1249
+ "language_model.model.layers.26.mlp.switch_mlp.gate_proj": true,
1250
+ "language_model.model.layers.26.mlp.switch_mlp.up_proj": true,
1251
+ "language_model.model.layers.26.mlp.switch_mlp.down_proj": true,
1252
+ "language_model.model.layers.26.mlp.gate": false,
1253
+ "language_model.model.layers.26.mlp.shared_experts.gate_proj": true,
1254
+ "language_model.model.layers.26.mlp.shared_experts.up_proj": true,
1255
+ "language_model.model.layers.26.mlp.shared_experts.down_proj": true,
1256
+ "language_model.model.layers.26.input_layernorm": false,
1257
+ "language_model.model.layers.26.post_attention_layernorm": false,
1258
+ "language_model.model.norm": false,
1259
+ "language_model.lm_head": true,
1260
+ "multi_modal_projector.pre_norm": false,
1261
+ "multi_modal_projector.linear_1": true,
1262
+ "multi_modal_projector.act": false,
1263
+ "multi_modal_projector.linear_2": true
1264
+ },
1265
+ "remove_invalid_values": false,
1266
+ "repetition_penalty": 1.0,
1267
+ "return_dict": true,
1268
+ "return_dict_in_generate": false,
1269
+ "sep_token_id": null,
1270
+ "suppress_tokens": null,
1271
+ "task_specific_params": null,
1272
+ "temperature": 1.0,
1273
+ "text_config": {
1274
+ "vocab_size": 163840,
1275
+ "max_position_embeddings": 131072,
1276
+ "hidden_size": 2048,
1277
+ "intermediate_size": 11264,
1278
+ "moe_intermediate_size": 1408,
1279
+ "num_hidden_layers": 27,
1280
+ "num_nextn_predict_layers": 1,
1281
+ "num_attention_heads": 16,
1282
+ "n_shared_experts": 2,
1283
+ "n_routed_experts": 64,
1284
+ "ep_size": 1,
1285
+ "routed_scaling_factor": 2.446,
1286
+ "kv_lora_rank": 512,
1287
+ "q_lora_rank": null,
1288
+ "qk_rope_head_dim": 64,
1289
+ "v_head_dim": 128,
1290
+ "qk_nope_head_dim": 128,
1291
+ "topk_method": "noaux_tc",
1292
+ "n_group": 1,
1293
+ "topk_group": 1,
1294
+ "num_experts_per_tok": 6,
1295
+ "moe_layer_freq": 1,
1296
+ "first_k_dense_replace": 1,
1297
+ "norm_topk_prob": true,
1298
+ "scoring_func": "sigmoid",
1299
+ "aux_loss_alpha": 0.001,
1300
+ "seq_aux": true,
1301
+ "num_key_value_heads": 16,
1302
+ "hidden_act": "silu",
1303
+ "initializer_range": 0.02,
1304
+ "rms_norm_eps": 1e-05,
1305
+ "pretraining_tp": 1,
1306
+ "use_cache": true,
1307
+ "rope_theta": 800000.0,
1308
+ "rope_scaling": null,
1309
+ "attention_bias": false,
1310
+ "attention_dropout": 0.0,
1311
+ "return_dict": true,
1312
+ "output_hidden_states": false,
1313
+ "torchscript": false,
1314
+ "torch_dtype": "bfloat16",
1315
+ "use_bfloat16": false,
1316
+ "tf_legacy_loss": false,
1317
+ "pruned_heads": {},
1318
+ "tie_word_embeddings": false,
1319
+ "chunk_size_feed_forward": 0,
1320
+ "is_encoder_decoder": false,
1321
+ "is_decoder": false,
1322
+ "cross_attention_hidden_size": null,
1323
+ "add_cross_attention": false,
1324
+ "tie_encoder_decoder": false,
1325
+ "max_length": 20,
1326
+ "min_length": 0,
1327
+ "do_sample": false,
1328
+ "early_stopping": false,
1329
+ "num_beams": 1,
1330
+ "num_beam_groups": 1,
1331
+ "diversity_penalty": 0.0,
1332
+ "temperature": 1.0,
1333
+ "top_k": 50,
1334
+ "top_p": 1.0,
1335
+ "typical_p": 1.0,
1336
+ "repetition_penalty": 1.0,
1337
+ "length_penalty": 1.0,
1338
+ "no_repeat_ngram_size": 0,
1339
+ "encoder_no_repeat_ngram_size": 0,
1340
+ "bad_words_ids": null,
1341
+ "num_return_sequences": 1,
1342
+ "output_scores": false,
1343
+ "return_dict_in_generate": false,
1344
+ "forced_bos_token_id": null,
1345
+ "forced_eos_token_id": null,
1346
+ "remove_invalid_values": false,
1347
+ "exponential_decay_length_penalty": null,
1348
+ "suppress_tokens": null,
1349
+ "begin_suppress_tokens": null,
1350
+ "architectures": null,
1351
+ "finetuning_task": null,
1352
+ "id2label": {
1353
+ "0": "LABEL_0",
1354
+ "1": "LABEL_1"
1355
+ },
1356
+ "label2id": {
1357
+ "LABEL_0": 0,
1358
+ "LABEL_1": 1
1359
+ },
1360
+ "tokenizer_class": null,
1361
+ "prefix": null,
1362
+ "bos_token_id": 163584,
1363
+ "pad_token_id": 163839,
1364
+ "eos_token_id": 163585,
1365
+ "sep_token_id": null,
1366
+ "decoder_start_token_id": null,
1367
+ "task_specific_params": null,
1368
+ "problem_type": null,
1369
+ "_name_or_path": "",
1370
+ "model_type": "deepseek_v3",
1371
+ "output_attentions": false
1372
+ },
1373
+ "tf_legacy_loss": false,
1374
+ "tie_encoder_decoder": false,
1375
+ "tie_word_embeddings": false,
1376
+ "tokenizer_class": null,
1377
+ "top_k": 50,
1378
+ "top_p": 1.0,
1379
+ "torchscript": false,
1380
+ "transformers_version": "4.53.3",
1381
+ "typical_p": 1.0,
1382
+ "use_bfloat16": false,
1383
+ "vision_config": {
1384
+ "return_dict": true,
1385
+ "output_hidden_states": false,
1386
+ "torchscript": false,
1387
+ "torch_dtype": "bfloat16",
1388
+ "use_bfloat16": false,
1389
+ "tf_legacy_loss": false,
1390
+ "pruned_heads": {},
1391
+ "tie_word_embeddings": true,
1392
+ "chunk_size_feed_forward": 0,
1393
+ "is_encoder_decoder": false,
1394
+ "is_decoder": false,
1395
+ "cross_attention_hidden_size": null,
1396
+ "add_cross_attention": false,
1397
+ "tie_encoder_decoder": false,
1398
+ "max_length": 20,
1399
+ "min_length": 0,
1400
+ "do_sample": false,
1401
+ "early_stopping": false,
1402
+ "num_beams": 1,
1403
+ "num_beam_groups": 1,
1404
+ "diversity_penalty": 0.0,
1405
+ "temperature": 1.0,
1406
+ "top_k": 50,
1407
+ "top_p": 1.0,
1408
+ "typical_p": 1.0,
1409
+ "repetition_penalty": 1.0,
1410
+ "length_penalty": 1.0,
1411
+ "no_repeat_ngram_size": 0,
1412
+ "encoder_no_repeat_ngram_size": 0,
1413
+ "bad_words_ids": null,
1414
+ "num_return_sequences": 1,
1415
+ "output_scores": false,
1416
+ "return_dict_in_generate": false,
1417
+ "forced_bos_token_id": null,
1418
+ "forced_eos_token_id": null,
1419
+ "remove_invalid_values": false,
1420
+ "exponential_decay_length_penalty": null,
1421
+ "suppress_tokens": null,
1422
+ "begin_suppress_tokens": null,
1423
+ "architectures": null,
1424
+ "finetuning_task": null,
1425
+ "id2label": {
1426
+ "0": "LABEL_0",
1427
+ "1": "LABEL_1"
1428
+ },
1429
+ "label2id": {
1430
+ "LABEL_0": 0,
1431
+ "LABEL_1": 1
1432
+ },
1433
+ "tokenizer_class": null,
1434
+ "prefix": null,
1435
+ "bos_token_id": null,
1436
+ "pad_token_id": null,
1437
+ "eos_token_id": null,
1438
+ "sep_token_id": null,
1439
+ "decoder_start_token_id": null,
1440
+ "task_specific_params": null,
1441
+ "problem_type": null,
1442
+ "_name_or_path": "",
1443
+ "model_type": "moonvit",
1444
+ "patch_size": 14,
1445
+ "init_pos_emb_height": 64,
1446
+ "init_pos_emb_width": 64,
1447
+ "num_hidden_layers": 27,
1448
+ "num_attention_heads": 16,
1449
+ "hidden_size": 1152,
1450
+ "intermediate_size": 4304,
1451
+ "merge_kernel_size": [
1452
+ 2,
1453
+ 2
1454
+ ],
1455
+ "output_attentions": false
1456
+ },
1457
+ "vocab_size": 163840
1458
+ }
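The boolean map above closes the per-module listing; the remaining keys describe the language and vision towers. A few values worth decoding: each attention head carries a 128-dim no-RoPE part plus a 64-dim RoPE part (192 dims per query head under MLA, with keys/values compressed to kv_lora_rank 512), and each token activates 6 of the 64 routed experts plus 2 shared experts. A minimal inspection sketch, assuming the repository is cloned locally (the "." path is a placeholder) and `trust_remote_code=True` is acceptable since the config classes ship in `configuration_kimi_vl.py` below:

    # Sketch: read back the values recorded in config.json above.
    from transformers import AutoConfig

    cfg = AutoConfig.from_pretrained(".", trust_remote_code=True)

    text = cfg.text_config
    print(text.num_hidden_layers)                           # 27 decoder layers
    print(text.qk_nope_head_dim + text.qk_rope_head_dim)    # 192 dims per query head (128 no-RoPE + 64 RoPE)
    print(text.num_experts_per_tok, text.n_routed_experts)  # 6 experts routed per token, out of 64
    print(cfg.vision_config.patch_size)                     # 14-pixel MoonViT patches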
configuration_kimi_vl.py ADDED
@@ -0,0 +1,284 @@
+ from transformers.configuration_utils import PretrainedConfig
+ from transformers.utils import logging
+ from typing import Optional, Union
+
+ logger = logging.get_logger(__name__)
+
+ DEEPSEEK_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
+
+
+ class DeepseekV3Config(PretrainedConfig):
+     r"""
+     This is the configuration class to store the configuration of a [`DeepseekV3Model`]. It is used to instantiate a
+     DeepSeek model according to the specified arguments, defining the model architecture. Instantiating a configuration
+     with the defaults will yield a configuration similar to that of DeepSeek-V3.
+
+     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+     documentation from [`PretrainedConfig`] for more information.
+
+     Copied from https://huggingface.co/deepseek-ai/DeepSeek-V3/blob/main/configuration_deepseek.py
+
+     Args:
+         vocab_size (`int`, *optional*, defaults to 129280):
+             Vocabulary size of the DeepSeek model. Defines the number of different tokens that can be represented by
+             the `inputs_ids` passed when calling [`DeepseekV3Model`].
+         hidden_size (`int`, *optional*, defaults to 7168):
+             Dimension of the hidden representations.
+         intermediate_size (`int`, *optional*, defaults to 18432):
+             Dimension of the MLP representations.
+         moe_intermediate_size (`int`, *optional*, defaults to 2048):
+             Dimension of the MoE representations.
+         num_hidden_layers (`int`, *optional*, defaults to 61):
+             Number of hidden layers in the Transformer decoder.
+         num_nextn_predict_layers (`int`, *optional*, defaults to 1):
+             Number of next-n prediction layers in the DeepseekV3 model.
+         num_attention_heads (`int`, *optional*, defaults to 128):
+             Number of attention heads for each attention layer in the Transformer decoder.
+         n_shared_experts (`int`, *optional*, defaults to 1):
+             Number of shared experts; `None` means a dense model.
+         n_routed_experts (`int`, *optional*, defaults to 256):
+             Number of routed experts; `None` means a dense model.
+         routed_scaling_factor (`float`, *optional*, defaults to 2.5):
+             Scaling factor for routed experts.
+         topk_method (`str`, *optional*, defaults to `"noaux_tc"`):
+             Top-k method used in the routing gate.
+         n_group (`int`, *optional*, defaults to 8):
+             Number of groups for routed experts.
+         topk_group (`int`, *optional*, defaults to 4):
+             Number of selected groups for each token (for each token, the selected experts are restricted to within
+             `topk_group` groups).
+         num_experts_per_tok (`int`, *optional*, defaults to 8):
+             Number of selected experts; `None` means a dense model.
+         moe_layer_freq (`int`, *optional*, defaults to 1):
+             The frequency of the MoE layer: one expert layer for every `moe_layer_freq - 1` dense layers.
+         first_k_dense_replace (`int`, *optional*, defaults to 3):
+             Number of dense layers in the shallow layers (embed->dense->dense->...->dense->moe->moe...->lm_head).
+                                                                  \--k dense layers--/
+         norm_topk_prob (`bool`, *optional*, defaults to `True`):
+             Whether to normalize the weights of the routed experts.
+         scoring_func (`str`, *optional*, defaults to `"sigmoid"`):
+             Method of computing expert weights.
+         aux_loss_alpha (`float`, *optional*, defaults to 0.001):
+             Auxiliary loss weight coefficient.
+         seq_aux (`bool`, *optional*, defaults to `True`):
+             Whether to compute the auxiliary loss for each individual sample.
+         num_key_value_heads (`int`, *optional*):
+             This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+             `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA); if
+             `num_key_value_heads=1`, the model will use Multi Query Attention (MQA); otherwise GQA is used. When
+             converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+             by mean-pooling all the original heads within that group. For more details, check out [this
+             paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, it will default to
+             `num_attention_heads`.
+         hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+             The non-linear activation function (function or string) in the decoder.
+         max_position_embeddings (`int`, *optional*, defaults to 4096):
+             The maximum sequence length that this model might ever be used with.
+         initializer_range (`float`, *optional*, defaults to 0.02):
+             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+         rms_norm_eps (`float`, *optional*, defaults to 1e-06):
+             The epsilon used by the rms normalization layers.
+         use_cache (`bool`, *optional*, defaults to `True`):
+             Whether or not the model should return the last key/values attentions (not used by all models). Only
+             relevant if `config.is_decoder=True`.
+         pad_token_id (`int`, *optional*):
+             Padding token id.
+         bos_token_id (`int`, *optional*, defaults to 0):
+             Beginning of stream token id.
+         eos_token_id (`int`, *optional*, defaults to 1):
+             End of stream token id.
+         pretraining_tp (`int`, *optional*, defaults to 1):
+             Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this
+             document](https://huggingface.co/docs/transformers/parallelism) to understand more about it. This value is
+             necessary to ensure exact reproducibility of the pretraining results. Please refer to [this
+             issue](https://github.com/pytorch/pytorch/issues/76232).
+         tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+             Whether to tie the input and output word embeddings.
+         rope_theta (`float`, *optional*, defaults to 10000.0):
+             The base period of the RoPE embeddings.
+         rope_scaling (`Dict`, *optional*):
+             Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
+             strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is
+             `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
+             `max_position_embeddings` to the expected new maximum.
+         attention_bias (`bool`, *optional*, defaults to `False`):
+             Whether to use a bias in the query, key, value and output projection layers during self-attention.
+         attention_dropout (`float`, *optional*, defaults to 0.0):
+             The dropout ratio for the attention probabilities.
+
+     ```python
+     >>> from transformers import DeepseekV3Model, DeepseekV3Config
+
+     >>> # Initializing a DeepSeek-V3 style configuration
+     >>> configuration = DeepseekV3Config()
+
+     >>> # Initializing a model from the configuration and accessing its config
+     >>> model = DeepseekV3Model(configuration)
+     >>> configuration = model.config
+     ```"""
+
+     model_type = "deepseek_v3"
+     keys_to_ignore_at_inference = ["past_key_values"]
+
+     def __init__(
+         self,
+         vocab_size=129280,
+         hidden_size=7168,
+         intermediate_size=18432,
+         moe_intermediate_size=2048,
+         num_hidden_layers=61,
+         num_nextn_predict_layers=1,
+         num_attention_heads=128,
+         num_key_value_heads=128,
+         n_shared_experts=1,
+         n_routed_experts=256,
+         ep_size=1,
+         routed_scaling_factor=2.5,
+         kv_lora_rank=512,
+         q_lora_rank=1536,
+         qk_rope_head_dim=64,
+         v_head_dim=128,
+         qk_nope_head_dim=128,
+         topk_method="noaux_tc",
+         n_group=8,
+         topk_group=4,
+         num_experts_per_tok=8,
+         moe_layer_freq=1,
+         first_k_dense_replace=3,
+         norm_topk_prob=True,
+         scoring_func="sigmoid",
+         aux_loss_alpha=0.001,
+         seq_aux=True,
+         hidden_act="silu",
+         max_position_embeddings=4096,
+         initializer_range=0.02,
+         rms_norm_eps=1e-6,
+         use_cache=True,
+         pad_token_id=None,
+         bos_token_id=0,
+         eos_token_id=1,
+         pretraining_tp=1,
+         tie_word_embeddings=False,
+         rope_theta=10000.0,
+         rope_scaling=None,
+         attention_bias=False,
+         attention_dropout=0.0,
+         **kwargs,
+     ):
+         self.vocab_size = vocab_size
+         self.max_position_embeddings = max_position_embeddings
+         self.hidden_size = hidden_size
+         self.intermediate_size = intermediate_size
+         self.moe_intermediate_size = moe_intermediate_size
+         self.num_hidden_layers = num_hidden_layers
+         self.num_nextn_predict_layers = num_nextn_predict_layers
+         self.num_attention_heads = num_attention_heads
+         self.n_shared_experts = n_shared_experts
+         self.n_routed_experts = n_routed_experts
+         self.ep_size = ep_size
+         self.routed_scaling_factor = routed_scaling_factor
+         self.kv_lora_rank = kv_lora_rank
+         self.q_lora_rank = q_lora_rank
+         self.qk_rope_head_dim = qk_rope_head_dim
+         self.v_head_dim = v_head_dim
+         self.qk_nope_head_dim = qk_nope_head_dim
+         self.topk_method = topk_method
+         self.n_group = n_group
+         self.topk_group = topk_group
+         self.num_experts_per_tok = num_experts_per_tok
+         self.moe_layer_freq = moe_layer_freq
+         self.first_k_dense_replace = first_k_dense_replace
+         self.norm_topk_prob = norm_topk_prob
+         self.scoring_func = scoring_func
+         self.aux_loss_alpha = aux_loss_alpha
+         self.seq_aux = seq_aux
+         # for backward compatibility
+         if num_key_value_heads is None:
+             num_key_value_heads = num_attention_heads
+
+         self.num_key_value_heads = num_key_value_heads
+         self.hidden_act = hidden_act
+         self.initializer_range = initializer_range
+         self.rms_norm_eps = rms_norm_eps
+         self.pretraining_tp = pretraining_tp
+         self.use_cache = use_cache
+         self.rope_theta = rope_theta
+         self.rope_scaling = rope_scaling
+         self.attention_bias = attention_bias
+         self.attention_dropout = attention_dropout
+
+         super().__init__(
+             pad_token_id=pad_token_id,
+             bos_token_id=bos_token_id,
+             eos_token_id=eos_token_id,
+             tie_word_embeddings=tie_word_embeddings,
+             **kwargs,
+         )
+
+
+ class MoonViTConfig(PretrainedConfig):
+     model_type = "moonvit"
+
+     def __init__(
+         self,
+         patch_size: int = 14,
+         init_pos_emb_height: int = 64,
+         init_pos_emb_width: int = 64,
+         num_attention_heads: int = 16,
+         num_hidden_layers: int = 27,
+         hidden_size: int = 1152,
+         intermediate_size: int = 4304,
+         merge_kernel_size: tuple[int, int] = (2, 2),
+         **kwargs,
+     ):
+         super().__init__(**kwargs)
+         self.patch_size = patch_size
+         # Positional embedding config
+         self.init_pos_emb_height = init_pos_emb_height
+         self.init_pos_emb_width = init_pos_emb_width
+         # Transformer config
+         self.num_hidden_layers = num_hidden_layers
+         self.num_attention_heads = num_attention_heads
+         self.hidden_size = hidden_size
+         self.intermediate_size = intermediate_size
+         # Patch merger config
+         self.merge_kernel_size = merge_kernel_size
+
+
+ class KimiVLConfig(PretrainedConfig):
+     model_type = "kimi_vl"
+
+     def __init__(
+         self,
+         vision_config: Optional[Union[dict, MoonViTConfig]] = None,
+         text_config: Optional[Union[dict, DeepseekV3Config]] = None,
+         ignore_index: int = -100,
+         media_placeholder_token_id: int = 163605,
+         pad_token_id: int = 0,
+         **kwargs,
+     ):
+         if vision_config is None:
+             vision_config = MoonViTConfig()
+         elif isinstance(vision_config, dict):
+             vision_config = MoonViTConfig(**vision_config)
+         self.vision_config = vision_config
+
+         if text_config is None:
+             text_config = DeepseekV3Config()
+         elif isinstance(text_config, dict):
+             text_config = DeepseekV3Config(**text_config)
+         self.text_config = text_config
+
+         self.ignore_index = ignore_index
+         self.media_placeholder_token_id = media_placeholder_token_id
+
+         attn_implementation = kwargs.get("attn_implementation")
+         if attn_implementation is not None:
+             if attn_implementation in ["eager", "flash_attention_2"]:
+                 self._attn_implementation = attn_implementation
+                 self.vision_config._attn_implementation = attn_implementation
+                 self.text_config._attn_implementation = attn_implementation
+             else:
+                 raise ValueError(
+                     f"Invalid attention implementation: {attn_implementation}"
+                 )
+
+         super().__init__(pad_token_id=pad_token_id, **kwargs)
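A short usage sketch for the three config classes above; the constructor values are illustrative, not additional defaults. It shows how dict sub-configs are promoted to config objects and how `attn_implementation` is validated against the whitelist in `KimiVLConfig.__init__` (assumes the file is importable from the repo root):

    from configuration_kimi_vl import KimiVLConfig, MoonViTConfig

    cfg = KimiVLConfig(
        vision_config={"hidden_size": 1152, "num_hidden_layers": 27},  # dict is promoted to MoonViTConfig
        text_config={"hidden_size": 2048, "num_hidden_layers": 27},    # dict is promoted to DeepseekV3Config
        attn_implementation="flash_attention_2",  # only "eager" or "flash_attention_2" pass; others raise ValueError
    )
    assert isinstance(cfg.vision_config, MoonViTConfig)
    print(cfg.text_config.hidden_size)         # 2048
    print(cfg.media_placeholder_token_id)      # 163605, the default media placeholder id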
generation_config.json ADDED
@@ -0,0 +1,9 @@
+ {
+   "bos_token_id": 163584,
+   "pad_token_id": 163838,
+   "eos_token_id": [
+     163585
+   ],
+   "do_sample": true,
+   "temperature": 0.6
+ }
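These defaults enable sampling at temperature 0.6. Note that `pad_token_id` is 163838 here but 163839 in `text_config` of config.json above; `generate()` consults the generation config, so 163838 is what applies at inference time. A loading sketch (the "." path is a placeholder for this repo):

    from transformers import GenerationConfig

    gen = GenerationConfig.from_pretrained(".")  # reads generation_config.json
    print(gen.do_sample, gen.temperature)        # True 0.6
    print(gen.eos_token_id)                      # [163585]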
image_processing_kimi_vl.py ADDED
@@ -0,0 +1,126 @@
+ """Image processor class for KimiVL."""
+
+ import math
+ import numpy as np
+ from PIL import Image
+ from typing import Optional, Union
+
+ import torch
+ from torchvision.transforms import functional as TF
+ from transformers.image_utils import ImageInput, make_list_of_images, valid_images
+ from transformers.image_processing_utils import BaseImageProcessor, BatchFeature
+ from transformers.utils import TensorType
+
+
+ OPENAI_DATASET_MEAN = (0.48145466, 0.4578275, 0.40821073)
+ OPENAI_DATASET_STD = (0.26862954, 0.26130258, 0.27577711)
+
+
+ class KimiVLImageProcessor(BaseImageProcessor):
+     model_type = "kimi_vl"
+
+     def __init__(
+         self,
+         patch_size: int = 14,
+         pad_input: bool = False,
+         image_mean: tuple[float, float, float] = OPENAI_DATASET_MEAN,
+         image_std: tuple[float, float, float] = OPENAI_DATASET_STD,
+         in_token_limit: int = 4096,
+         merge_kernel_size: list[int] = [2, 2],
+         **kwargs,
+     ):
+         super().__init__(**kwargs)
+         self.in_token_limit = in_token_limit
+         self.patch_size = patch_size
+         self.pad_input = pad_input
+         self.image_mean = image_mean
+         self.image_std = image_std
+         self.merge_kernel_size = merge_kernel_size
+
+     def rescale(
+         self, image: Image.Image, merge_kernel_size: list[int] = [2, 2]
+     ) -> Image.Image:
+         w, h = image.size
+         patch_size = self.patch_size
+
+         if (w // patch_size) * (h // patch_size) > self.in_token_limit:
+             scale = math.sqrt(self.in_token_limit / ((w // patch_size) * (h // patch_size)))
+             new_w, new_h = int(w * scale), int(h * scale)
+             image = image.resize((new_w, new_h), Image.Resampling.BICUBIC)
+         if self.pad_input:
+             new_w, new_h = image.size
+             pad_size_h = merge_kernel_size[0] * patch_size
+             pad_size_w = merge_kernel_size[1] * patch_size
+
+             pad_h = (pad_size_h - new_h % pad_size_h) % pad_size_h
+             pad_w = (pad_size_w - new_w % pad_size_w) % pad_size_w
+
+             image = TF.pad(image, (0, 0, pad_w, pad_h))
+         else:
+             new_w, new_h = image.size
+             new_w = new_w - new_w % patch_size
+             new_h = new_h - new_h % patch_size
+             image = TF.center_crop(image, (new_h, new_w))
+
+         w, h = image.size
+         if w // patch_size >= 512 or h // patch_size >= 512:
+             raise ValueError("Image exceeds positional embedding capacity (>= 512 patches per side)")
+
+         return image
+
+     def to_tensor(self, image: Image.Image) -> torch.Tensor:
+         return TF.to_tensor(image.convert("RGB"))
+
+     def normalize(self, image: torch.Tensor) -> torch.Tensor:
+         return TF.normalize(image, self.image_mean, self.image_std)
+
+     def patchify(self, image: torch.Tensor) -> tuple[torch.Tensor, tuple[int, int]]:
+         patch_size = self.patch_size
+         C, H, W = image.shape
+         patches = image.reshape(C, H // patch_size, patch_size, W // patch_size, patch_size)
+         patches = patches.permute(1, 3, 0, 2, 4)
+         patches = patches.contiguous().view(-1, C, patch_size, patch_size)
+         grid_hw = (H // patch_size, W // patch_size)
+         return patches, grid_hw
+
+     def _preprocess(self, image: ImageInput) -> tuple[torch.Tensor, tuple[int, int]]:
+         """
+         Preprocess an image and patchify it.
+
+         Args:
+             image (`ImageInput`):
+                 Image to preprocess. Expects pixel values ranging from 0 to 255.
+
+         Returns:
+             patches: torch.Tensor
+             grid_hw: tuple[int, int]
+         """
+         image = self.rescale(image, self.merge_kernel_size)
+         image = self.to_tensor(image)
+         image = self.normalize(image)
+         patches, grid_hw = self.patchify(image)
+         return patches, grid_hw
+
+     def preprocess(
+         self,
+         images: ImageInput,
+         return_tensors: Optional[Union[str, TensorType]] = None,
+     ) -> BatchFeature:
+         images = make_list_of_images(images)
+
+         if not valid_images(images):
+             raise ValueError(
+                 "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
+                 "torch.Tensor, tf.Tensor or jax.ndarray."
+             )
+
+         pixel_values, image_grid_hws = [], []
+         for image in images:
+             patches, image_grid_hw = self._preprocess(image)
+             pixel_values.append(patches)
+             image_grid_hws.append(image_grid_hw)
+         pixel_values = torch.concat(pixel_values, dim=0)
+         image_grid_hws = np.array(image_grid_hws)
+         data = {"pixel_values": pixel_values, "image_grid_hws": image_grid_hws}
+
+         return BatchFeature(data=data, tensor_type=return_tensors)
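A runnable sketch of the pipeline above, assuming `torch`, `torchvision`, and `Pillow` are installed and the repo root is on the import path. The patch arithmetic: an 896x672 image yields (672 // 14) x (896 // 14) = 48 x 64 = 3072 patches of 14x14 pixels, and the model's 2x2 patch merger later reduces those to 3072 // 4 = 768 image tokens.

    from PIL import Image
    from image_processing_kimi_vl import KimiVLImageProcessor

    # pad_input and in_token_limit mirror preprocessor_config.json below
    proc = KimiVLImageProcessor(pad_input=True, in_token_limit=16384)
    img = Image.new("RGB", (896, 672))  # width x height, both multiples of 2 * 14, so no padding is applied

    out = proc.preprocess(img, return_tensors="pt")
    print(out["pixel_values"].shape)  # torch.Size([3072, 3, 14, 14]) -- one row per 14x14 patch
    print(out["image_grid_hws"])      # tensor([[48, 64]]) -- (H // 14, W // 14)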
model-00001-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3fa86aea5779049caa0130dfddeb0785b08b9624d1f0e161ef185777c31c8776
+ size 5218358658
model-00002-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2a8a7250a9c6648d327bea9499c2e0e29ea861b604ca978489db2903f5d46545
+ size 5364694755
model-00003-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2c333f73cedd425473fbc35a1bb308d6349c18048df0be19858b371d36e78b33
+ size 5201366245
model-00004-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1b0bdcf220e4b86ed2e8a6f365427c2e8643a965aa91f912ef07228af500e78a
+ size 2043016472
model.safetensors.index.json ADDED
The diff for this file is too large to render. See raw diff
 
modeling_kimi_vl.py ADDED
The diff for this file is too large to render. See raw diff
 
preprocessor_config.json ADDED
@@ -0,0 +1,26 @@
+ {
+   "auto_map": {
+     "AutoImageProcessor": "image_processing_kimi_vl.KimiVLImageProcessor",
+     "AutoProcessor": "processing_kimi_vl.KimiVLProcessor"
+   },
+   "image_mean": [
+     0.5,
+     0.5,
+     0.5
+   ],
+   "image_processor_type": "KimiVLImageProcessor",
+   "image_std": [
+     0.5,
+     0.5,
+     0.5
+   ],
+   "in_token_limit": 16384,
+   "merge_kernel_size": [
+     2,
+     2
+   ],
+   "num_pooled_tokens": 1024,
+   "pad_input": true,
+   "patch_size": 14,
+   "processor_class": "KimiVLProcessor"
+ }
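When the processor is loaded with `from_pretrained`, these JSON values override the Python defaults in `image_processing_kimi_vl.py`: `image_mean`/`image_std` become 0.5 rather than the OPENAI_DATASET constants, and `in_token_limit` rises from 4096 to 16384. A minimal check (placeholder "." path, `trust_remote_code` needed because the class resolves via the `auto_map` above):

    from transformers import AutoImageProcessor

    ip = AutoImageProcessor.from_pretrained(".", trust_remote_code=True)
    print(ip.image_mean, ip.in_token_limit)  # [0.5, 0.5, 0.5] 16384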
processing_kimi_vl.py ADDED
@@ -0,0 +1,170 @@
+ # coding=utf-8
+ # Copyright 2025 The Moonshot Team and HuggingFace Inc. team. All rights reserved.
+ #
+ # The code is based on the Qwen2VL processor (qwen2_vl/processing_qwen2_vl.py), but modified for KimiVL.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """
+ Processor class for KimiVL.
+ """
+
+ from typing import List, Union
+
+ from transformers.feature_extraction_utils import BatchFeature
+ from transformers.image_utils import ImageInput
+ from transformers.processing_utils import ProcessingKwargs, ProcessorMixin, Unpack, _validate_images_text_input_order
+ from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
+ from transformers.utils import logging
+
+
+ logger = logging.get_logger(__name__)
+
+
+ class KimiVLProcessorKwargs(ProcessingKwargs, total=False):
+     _defaults = {
+         "text_kwargs": {
+             "padding": False,
+         },
+         "images_kwargs": {},
+     }
+
+
+ class KimiVLProcessor(ProcessorMixin):
+     r"""
+     Constructs a KimiVL processor which wraps a KimiVL image processor and a tokenizer into a single processor.
+
+     [`KimiVLProcessor`] offers all the functionalities of [`KimiVLImageProcessor`] and [`TikTokenTokenizer`]. See the
+     [`~KimiVLProcessor.__call__`] and [`~KimiVLProcessor.decode`] for more information.
+
+     Args:
+         image_processor ([`KimiVLImageProcessor`], *optional*):
+             The image processor is a required input.
+         tokenizer ([`TikTokenTokenizer`], *optional*):
+             The tokenizer is a required input.
+         chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
+             in a chat into a tokenizable string.
+     """
+
+     attributes = ["image_processor", "tokenizer"]
+     valid_kwargs = ["chat_template"]
+     image_processor_class = "AutoImageProcessor"
+     tokenizer_class = "AutoTokenizer"
+
+     def __init__(
+         self,
+         image_processor=None,
+         tokenizer=None,
+         chat_template=None,
+         **kwargs,
+     ):
+         self.image_token = "<|media_pad|>"
+         super().__init__(image_processor, tokenizer, chat_template=chat_template)
+
+     def __call__(
+         self,
+         images: ImageInput = None,
+         text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
+         **kwargs: Unpack[KimiVLProcessorKwargs],
+     ) -> BatchFeature:
+         """
+         Main method to prepare one or several sequence(s) and image(s) for the model. This method forwards the `text`
+         and `kwargs` arguments to TikTokenTokenizer's [`~TikTokenTokenizer.__call__`] if `text` is not `None` to encode
+         the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
+         KimiVLImageProcessor's [`~KimiVLImageProcessor.__call__`] if `images` is not `None`. Please refer to the
+         docstring of the above two methods for more information.
+
+         Args:
+             images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
+                 The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
+                 tensor. Both channels-first and channels-last formats are supported.
+             text (`str`, `List[str]`, `List[List[str]]`):
+                 The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
+                 (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
+                 `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
+             return_tensors (`str` or [`~utils.TensorType`], *optional*):
+                 If set, will return tensors of a particular framework. Acceptable values are:
+                 - `'tf'`: Return TensorFlow `tf.constant` objects.
+                 - `'pt'`: Return PyTorch `torch.Tensor` objects.
+                 - `'np'`: Return NumPy `np.ndarray` objects.
+                 - `'jax'`: Return JAX `jnp.ndarray` objects.
+
+         Returns:
+             [`BatchFeature`]: A [`BatchFeature`] with the following fields:
+
+             - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
+             - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
+               `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
+               `None`).
+             - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
+         """
+         if images is None and text is None:
+             raise ValueError("You have to specify at least one of `images` or `text`.")
+
+         # check if images and text inputs are reversed for BC
+         images, text = _validate_images_text_input_order(images, text)
+
+         output_kwargs = self._merge_kwargs(
+             KimiVLProcessorKwargs,
+             tokenizer_init_kwargs=self.tokenizer.init_kwargs,
+             **kwargs,
+         )
+         if images is not None:
+             image_inputs = self.image_processor(images, **output_kwargs["images_kwargs"])
+             image_grid_hws = image_inputs["image_grid_hws"]
+         else:
+             image_inputs = {}
+             image_grid_hws = None
+
+         if isinstance(text, str):
+             text = [text]
+         elif not isinstance(text, list) or not all(isinstance(t, str) for t in text):
+             raise ValueError("Invalid input text. Please provide a string, or a list of strings")
+
+         if image_grid_hws is not None:
+             merge_length = self.image_processor.merge_kernel_size[0] * self.image_processor.merge_kernel_size[1]
+             index = 0
+             for i in range(len(text)):
+                 while self.image_token in text[i]:
+                     text[i] = text[i].replace(
+                         self.image_token,
+                         "<|placeholder|>" * (image_grid_hws[index].prod() // merge_length),
+                         1,
+                     )
+                     index += 1
+                 text[i] = text[i].replace("<|placeholder|>", self.image_token)
+
+         text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
+         return BatchFeature(data={**text_inputs, **image_inputs})
+
+     def batch_decode(self, *args, **kwargs):
+         """
+         This method forwards all its arguments to TikTokenTokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please
+         refer to the docstring of this method for more information.
+         """
+         return self.tokenizer.batch_decode(*args, **kwargs)
+
+     def decode(self, *args, **kwargs):
+         """
+         This method forwards all its arguments to TikTokenTokenizer's [`~PreTrainedTokenizer.decode`]. Please refer to
+         the docstring of this method for more information.
+         """
+         return self.tokenizer.decode(*args, **kwargs)
+
+     @property
+     def model_input_names(self):
+         tokenizer_input_names = self.tokenizer.model_input_names
+         image_processor_input_names = self.image_processor.model_input_names
+         return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
+
+
+ __all__ = ["KimiVLProcessor", "KimiVLProcessorKwargs"]
processor_config.json ADDED
@@ -0,0 +1,6 @@
+ {
+   "auto_map": {
+     "AutoProcessor": "processing_kimi_vl.KimiVLProcessor"
+   },
+   "processor_class": "KimiVLProcessor"
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,41 @@
+ {
+   "additional_special_tokens": [
+     "<|im_end|>",
+     "<|im_user|>",
+     "<|im_assistant|>",
+     "<|im_system|>",
+     "<|im_middle|>",
+     "<|media_start|>",
+     "<|media_content|>",
+     "<|media_end|>",
+     "<|media_pad|>"
+   ],
+   "bos_token": {
+     "content": "[BOS]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "[EOS]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "[PAD]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "[UNK]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
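Marking these strings as special tokens has two observable effects: each one encodes to a single reserved id instead of being split by BPE, and they can be stripped on decode. A small sketch, assuming a tokenizer loaded from this repo (placeholder id):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("<repo-id>", trust_remote_code=True)

text = "<|im_user|>user<|im_middle|>hi<|im_end|>"
ids = tokenizer.encode(text)

# Each <|...|> marker above contributes exactly one id to `ids`.
# skip_special_tokens removes them again on decode, leaving "userhi".
print(tokenizer.decode(ids, skip_special_tokens=True))
```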
tiktoken.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b6c497a7469b33ced9c38afb1ad6e47f03f5e5dc05f15930799210ec050c5103
+ size 2795286
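`tiktoken.model` is checked in as a Git LFS pointer: the three lines record the pointer-spec version, the SHA-256 of the actual ~2.8 MB BPE rank file, and its size in bytes. Once the real blob has been fetched, it can be verified against the pointer with plain Python:

```python
import hashlib
import os

# Values copied from the LFS pointer above.
EXPECTED_OID = "b6c497a7469b33ced9c38afb1ad6e47f03f5e5dc05f15930799210ec050c5103"
EXPECTED_SIZE = 2795286

def verify_lfs_object(path: str) -> bool:
    """Check a downloaded file against the oid/size recorded in its LFS pointer."""
    if os.path.getsize(path) != EXPECTED_SIZE:
        return False
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            digest.update(chunk)
    return digest.hexdigest() == EXPECTED_OID

print(verify_lfs_object("tiktoken.model"))
```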
tokenization_moonshot.py ADDED
@@ -0,0 +1,302 @@
+ import os
+ import tiktoken
+
+ from logging import getLogger
+ from pathlib import Path
+ from typing import (
+     cast,
+     Tuple,
+     Dict,
+     Iterator,
+     List,
+     Union,
+     Optional,
+ )
+ from shutil import copyfile
+ from tiktoken.load import load_tiktoken_bpe
+ from tokenizers import AddedToken
+ from transformers.tokenization_utils import PreTrainedTokenizer
+ from transformers.utils import to_py_obj
+ from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode
+
+
+ logger = getLogger(__name__)
+ VOCAB_FILES_NAMES = {"vocab_file": "tiktoken.model"}
+ SPIECE_UNDERLINE = "▁"
+
+
+ class TikTokenTokenizer(PreTrainedTokenizer):
+     """
+     Tokenizing and encoding/decoding text using the Tiktoken tokenizer. See megatron/tokenizer/tiktoken_tokenizer.py.
+
+     This tokenizer inherits from [`PreTrainedTokenizer`], which contains most of the main methods. Users should refer
+     to this superclass for more information regarding those methods.
+
+     Args:
+         vocab_file (`str`):
+             The path to the Tiktoken model file.
+         bos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"[BOS]"`):
+             The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier
+             token.
+         eos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"[EOS]"`):
+             The end of sequence token.
+         unk_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"[UNK]"`):
+             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be
+             this token instead.
+         pad_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"[PAD]"`):
+             The token used for padding, for example when batching sequences of different lengths.
+         additional_special_tokens (list of `str`, *optional*):
+             A tuple or a list of additional tokens, which will be marked as `special`, meaning that they will be
+             skipped when decoding if `skip_special_tokens` is set to `True`.
+     """
+
+     vocab_files_names = VOCAB_FILES_NAMES
+
+     model_input_names = ["input_ids", "attention_mask"]
+
+     special_tokens: Dict[str, int]
+
+     num_reserved_special_tokens = 256
+
+     pat_str = "|".join(
+         [
+             r"""[\p{Han}]+""",
+             r"""[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]*[\p{Ll}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?""",
+             r"""[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]+[\p{Ll}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?""",
+             r"""\p{N}{1,3}""",
+             r""" ?[^\s\p{L}\p{N}]+[\r\n]*""",
+             r"""\s*[\r\n]+""",
+             r"""\s+(?!\S)""",
+             r"""\s+""",
+         ]
+     )
+
+     def __init__(
+         self,
+         vocab_file,
+         bos_token: Union[str, AddedToken] = "[BOS]",
+         eos_token: Union[str, AddedToken] = "[EOS]",
+         unk_token: Union[str, AddedToken] = "[UNK]",
+         pad_token: Union[str, AddedToken] = "[PAD]",
+         additional_special_tokens: Optional[List[str]] = None,
+         added_tokens_decoder: Optional[dict] = None,
+         **kwargs,
+     ):
+         assert os.path.isfile(vocab_file), vocab_file
+         if additional_special_tokens is None:
+             additional_special_tokens = [
+                 "<|im_end|>",
+                 "<|im_middle|>",
+                 "<|im_user|>",
+                 "<|im_assistant|>",
+                 "<|im_system|>",
+             ]
+         if added_tokens_decoder is None:
+             added_tokens_decoder = {}
+         special_tokens_mapping = {
+             i: added_tokens_decoder[i].content for i in added_tokens_decoder
+         }
+
+         self.vocab_file = vocab_file
+         mergeable_ranks = load_tiktoken_bpe(vocab_file)
+         num_base_tokens = len(mergeable_ranks)
+         self.special_tokens = {
+             special_tokens_mapping.get(i, f"<|reserved_token_{i}|>"): i
+             for i in range(
+                 num_base_tokens, num_base_tokens + self.num_reserved_special_tokens + 2
+             )
+         }
+
+         self.model = tiktoken.Encoding(
+             name=Path(vocab_file).name,
+             pat_str=self.pat_str,
+             mergeable_ranks=mergeable_ranks,
+             special_tokens=self.special_tokens,
+         )
+
+         self.n_words: int = self.model.n_vocab
+         # BOS / EOS token IDs
+         self.bos_id: int = self.special_tokens[str(bos_token)]
+         self.eos_id: int = self.special_tokens[str(eos_token)]
+
+         self.pad_id: int = self.special_tokens[str(pad_token)]
+         self.unk_id: int = self.special_tokens[str(unk_token)]
+
+         self.byte_encoder = bytes_to_unicode()
+         self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
+
+         self.decoder = {}
+         for i in range(self.n_words):
+             # Taken from https://gist.github.com/xenova/a452a6474428de0182b17605a98631ee
+             decoding = "".join(
+                 [
+                     self.byte_encoder[ord(char)]
+                     for char in self.model.decode_single_token_bytes(i).decode(
+                         "latin-1"
+                     )
+                 ]
+             )
+             self.decoder[i] = decoding
+
+         self.encoder = {}
+         for i in range(self.n_words):
+             if i in self.decoder:
+                 self.encoder[self.decoder[i]] = i
+
+         super().__init__(
+             bos_token=bos_token,
+             eos_token=eos_token,
+             unk_token=unk_token,
+             pad_token=pad_token,
+             additional_special_tokens=additional_special_tokens,
+             **kwargs,
+         )
+         self.all_special_ids_set = set(self.all_special_ids)
+
+     def encode(
+         self, text: str, allow_special_tokens: bool = True, **kwargs
+     ) -> List[int]:
+         """
+         Encodes a string into a list of token IDs.
+
+         Args:
+             text (str): The input string to be encoded.
+
+         Returns:
+             list[int]: A list of token IDs.
+         """
+         # If there are other kwargs, defer to super().encode, which contains the logic
+         # for handling them; super().encode ultimately calls _tokenize and _convert_token_to_id.
+         if len(kwargs) > 0:
+             return super().encode(text, **kwargs)
+
+         assert type(text) is str
+
+         # The tiktoken tokenizer can handle <=400k chars without
+         # pyo3_runtime.PanicException.
+         TIKTOKEN_MAX_ENCODE_CHARS = 400_000
+
+         # https://github.com/openai/tiktoken/issues/195
+         # Here we iterate over subsequences and split if we exceed the limit
+         # of max consecutive non-whitespace or whitespace characters.
+         MAX_NO_WHITESPACES_CHARS = 25_000
+
+         substrs = (
+             substr
+             for i in range(0, len(text), TIKTOKEN_MAX_ENCODE_CHARS)
+             for substr in self._split_whitespaces_or_nonwhitespaces(
+                 text[i : i + TIKTOKEN_MAX_ENCODE_CHARS], MAX_NO_WHITESPACES_CHARS
+             )
+         )
+         t: List[int] = []
+         for substr in substrs:
+             if allow_special_tokens:
+                 t.extend(
+                     # encode special-token text directly to its reserved ID
+                     self.model.encode(
+                         substr,
+                         allowed_special="all",
+                     )
+                 )
+             else:
+                 t.extend(
+                     # encode special-token text as ordinary text, never as a special ID
+                     self.model.encode(
+                         substr,
+                         disallowed_special=(),
+                     )
+                 )
+         return t
+
+     def decode(self, token_ids: Union[int, List[int]], **kwargs) -> str:
+         """
+         Decodes a list of token IDs into a string.
+
+         Args:
+             token_ids (Union[int, List[int]]): The token ID or list of token IDs to be decoded.
+
+         Returns:
+             str: The decoded string.
+         """
+         # If there are other kwargs, defer to super().decode, which contains the logic
+         # for handling them; super().decode ultimately calls convert_tokens_to_string and _convert_id_to_token.
+         if len(kwargs) > 0:
+             return super().decode(token_ids, **kwargs)
+
+         token_ids = to_py_obj(token_ids)
+
+         if type(token_ids) is int:
+             token_ids = [token_ids]
+
+         return self.model.decode(cast(List[int], token_ids))
+
+     @staticmethod
+     def _split_whitespaces_or_nonwhitespaces(
+         s: str, max_consecutive_slice_len: int
+     ) -> Iterator[str]:
+         """
+         Splits the string `s` so that each substring contains no more than `max_consecutive_slice_len`
+         consecutive whitespace or consecutive non-whitespace characters.
+         """
+         current_slice_len = 0
+         current_slice_is_space = s[0].isspace() if len(s) > 0 else False
+         slice_start = 0
+
+         for i in range(len(s)):
+             is_now_space = s[i].isspace()
+
+             if current_slice_is_space ^ is_now_space:
+                 current_slice_len = 1
+                 current_slice_is_space = is_now_space
+             else:
+                 current_slice_len += 1
+                 if current_slice_len > max_consecutive_slice_len:
+                     yield s[slice_start:i]
+                     slice_start = i
+                     current_slice_len = 1
+         yield s[slice_start:]
+
+     """ ----- Below are the abstract methods required by PreTrainedTokenizer ----- """
+
+     @property
+     def vocab_size(self) -> int:
+         return self.n_words
+
+     def get_vocab(self) -> Dict[str, int]:
+         return self.encoder
+
+     def _tokenize(self, text: str, **kwargs) -> List[str]:
+         return [self.decoder[t] for t in self.encode(text)]
+
+     def _convert_token_to_id(self, token: str) -> int:
+         return self.encoder.get(token, self.unk_id)
+
+     def _convert_id_to_token(self, index: int) -> str:
+         return self.decoder.get(index)
+
+     @staticmethod
+     def clean_up_tokenization(out_string: str) -> str:
+         return out_string
+
+     def convert_tokens_to_string(self, tokens: List[str]) -> str:
+         text = "".join(tokens).replace(SPIECE_UNDERLINE, "")
+         text = bytearray([self.byte_decoder[c] for c in text]).decode(
+             "utf-8", "replace"
+         )
+         return text
+
+     def save_vocabulary(
+         self, save_directory: str, filename_prefix: Optional[str] = None
+     ) -> Tuple[str]:
+         if not os.path.isdir(save_directory):
+             logger.error(f"Vocabulary path ({save_directory}) should be a directory")
+             return
+         out_vocab_file = os.path.join(
+             save_directory,
+             (filename_prefix + "-" if filename_prefix else "")
+             + VOCAB_FILES_NAMES["vocab_file"],
+         )
+
+         if os.path.abspath(self.vocab_file) != os.path.abspath(
+             out_vocab_file
+         ) and os.path.isfile(self.vocab_file):
+             copyfile(self.vocab_file, out_vocab_file)
+
+         return (out_vocab_file,)
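For local experimentation the class can also be instantiated directly, bypassing `AutoTokenizer`. The sketch below assumes the script runs next to `tiktoken.model`, that the base BPE vocabulary ends just below id 163584 (so the ids fall in the reserved special-token range, as in `tokenizer_config.json`), and that your `tokenizers` version supports the `special=` keyword on `AddedToken`; normally `added_tokens_decoder` is populated from `tokenizer_config.json` instead.

```python
from tokenizers import AddedToken

from tokenization_moonshot import TikTokenTokenizer

# A minimal id -> token table; the full one lives in tokenizer_config.json.
added = {
    163584: AddedToken("[BOS]", special=True),
    163585: AddedToken("[EOS]", special=True),
    163838: AddedToken("[PAD]", special=True),
    163839: AddedToken("[UNK]", special=True),
}
tok = TikTokenTokenizer("tiktoken.model", added_tokens_decoder=added)

ids = tok.encode("Hello, Kimi-VL!")
assert tok.decode(ids) == "Hello, Kimi-VL!"  # lossless round trip on plain text
```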
tokenizer_config.json ADDED
@@ -0,0 +1,135 @@
+ {
+   "added_tokens_decoder": {
+     "163584": {
+       "content": "[BOS]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "163585": {
+       "content": "[EOS]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "163586": {
+       "content": "<|im_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "163587": {
+       "content": "<|im_user|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "163588": {
+       "content": "<|im_assistant|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "163594": {
+       "content": "<|im_system|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "163601": {
+       "content": "<|im_middle|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "163602": {
+       "content": "<|media_start|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "163603": {
+       "content": "<|media_content|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "163604": {
+       "content": "<|media_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "163605": {
+       "content": "<|media_pad|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "163838": {
+       "content": "[PAD]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "163839": {
+       "content": "[UNK]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "additional_special_tokens": [
+     "<|im_end|>",
+     "<|im_user|>",
+     "<|im_assistant|>",
+     "<|im_system|>",
+     "<|im_middle|>",
+     "<|media_start|>",
+     "<|media_content|>",
+     "<|media_end|>",
+     "<|media_pad|>"
+   ],
+   "auto_map": {
+     "AutoProcessor": "processing_kimi_vl.KimiVLProcessor",
+     "AutoTokenizer": [
+       "tokenization_moonshot.TikTokenTokenizer",
+       null
+     ]
+   },
+   "bos_token": "[BOS]",
+   "clean_up_tokenization_spaces": false,
+   "eos_token": "[EOS]",
+   "extra_special_tokens": {},
+   "model_max_length": 1048576,
+   "pad_token": "[PAD]",
+   "processor_class": "KimiVLProcessor",
+   "tokenizer_class": "TikTokenTokenizer",
+   "unk_token": "[UNK]"
+ }
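Note the division of labor here: `added_tokens_decoder` above is exactly the table that becomes `special_tokens_mapping` in `TikTokenTokenizer.__init__`, and any reserved id it omits falls back to a `<|reserved_token_N|>` placeholder. The sketch below reconstructs that mapping from the JSON alone; the base vocabulary size of 163584 is an assumption inferred from the lowest declared id.

```python
import json

with open("tokenizer_config.json") as f:
    cfg = json.load(f)

# id -> content for every token declared in this config.
declared = {int(i): tok["content"] for i, tok in cfg["added_tokens_decoder"].items()}

NUM_BASE_TOKENS = 163584  # assumed len(mergeable_ranks); matches the lowest declared id
NUM_RESERVED = 256

# Mirrors TikTokenTokenizer.__init__: declared content where available,
# otherwise a <|reserved_token_N|> placeholder.
special_tokens = {
    declared.get(i, f"<|reserved_token_{i}|>"): i
    for i in range(NUM_BASE_TOKENS, NUM_BASE_TOKENS + NUM_RESERVED + 2)
}
print(special_tokens["<|media_pad|>"])  # 163605
```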