n1ck-guo commited on
Commit
c7ff1cf
·
verified ·
1 Parent(s): 0df3bbd

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +1 -0
  2. chat_template.jinja +3 -0
  3. config.json +1014 -0
  4. configuration_deepseek.py +199 -0
  5. generation_config.json +9 -0
  6. model-00001-of-00072.safetensors +3 -0
  7. model-00002-of-00072.safetensors +3 -0
  8. model-00003-of-00072.safetensors +3 -0
  9. model-00004-of-00072.safetensors +3 -0
  10. model-00005-of-00072.safetensors +3 -0
  11. model-00006-of-00072.safetensors +3 -0
  12. model-00007-of-00072.safetensors +3 -0
  13. model-00008-of-00072.safetensors +3 -0
  14. model-00009-of-00072.safetensors +3 -0
  15. model-00010-of-00072.safetensors +3 -0
  16. model-00011-of-00072.safetensors +3 -0
  17. model-00012-of-00072.safetensors +3 -0
  18. model-00013-of-00072.safetensors +3 -0
  19. model-00014-of-00072.safetensors +3 -0
  20. model-00015-of-00072.safetensors +3 -0
  21. model-00016-of-00072.safetensors +3 -0
  22. model-00017-of-00072.safetensors +3 -0
  23. model-00018-of-00072.safetensors +3 -0
  24. model-00019-of-00072.safetensors +3 -0
  25. model-00020-of-00072.safetensors +3 -0
  26. model-00021-of-00072.safetensors +3 -0
  27. model-00022-of-00072.safetensors +3 -0
  28. model-00023-of-00072.safetensors +3 -0
  29. model-00024-of-00072.safetensors +3 -0
  30. model-00025-of-00072.safetensors +3 -0
  31. model-00026-of-00072.safetensors +3 -0
  32. model-00027-of-00072.safetensors +3 -0
  33. model-00028-of-00072.safetensors +3 -0
  34. model-00029-of-00072.safetensors +3 -0
  35. model-00030-of-00072.safetensors +3 -0
  36. model-00031-of-00072.safetensors +3 -0
  37. model-00032-of-00072.safetensors +3 -0
  38. model-00033-of-00072.safetensors +3 -0
  39. model-00034-of-00072.safetensors +3 -0
  40. model-00035-of-00072.safetensors +3 -0
  41. model-00036-of-00072.safetensors +3 -0
  42. model-00037-of-00072.safetensors +3 -0
  43. model-00038-of-00072.safetensors +3 -0
  44. model-00039-of-00072.safetensors +3 -0
  45. model-00040-of-00072.safetensors +3 -0
  46. model-00041-of-00072.safetensors +3 -0
  47. model-00042-of-00072.safetensors +3 -0
  48. model-00043-of-00072.safetensors +3 -0
  49. model-00044-of-00072.safetensors +3 -0
  50. model-00045-of-00072.safetensors +3 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ model.safetensors.index.json filter=lfs diff=lfs merge=lfs -text
chat_template.jinja ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% if not thinking is defined %}{% set thinking = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, system_prompt='', is_first_sp=true, is_last_user=false) %}{%- for message in messages %}{%- if message['role'] == 'system' %}{%- if ns.is_first_sp %}{% set ns.system_prompt = ns.system_prompt + message['content'] %}{% set ns.is_first_sp = false %}{%- else %}{% set ns.system_prompt = ns.system_prompt + '
2
+
3
+ ' + message['content'] %}{%- endif %}{%- endif %}{%- endfor %}{{ bos_token }}{{ ns.system_prompt }}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{%- set ns.is_first = false -%}{%- set ns.is_last_user = true -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['tool_calls'] is defined and message['tool_calls'] is not none %}{%- if ns.is_last_user %}{{'<|Assistant|></think>'}}{%- endif %}{%- set ns.is_last_user = false -%}{%- set ns.is_first = false %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls'] %}{%- if not ns.is_first %}{%- if message['content'] is none %}{{'<|tool▁calls▁begin|><|tool▁call▁begin|>'+ tool['function']['name'] + '<|tool▁sep|>' + tool['function']['arguments'] + '<|tool▁call▁end|>'}}{%- else %}{{message['content'] + '<|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['function']['name'] + '<|tool▁sep|>' + tool['function']['arguments'] + '<|tool▁call▁end|>'}}{%- endif %}{%- set ns.is_first = true -%}{%- else %}{{'<|tool▁call▁begin|>'+ tool['function']['name'] + '<|tool▁sep|>' + tool['function']['arguments'] + '<|tool▁call▁end|>'}}{%- endif %}{%- endfor %}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- if message['role'] == 'assistant' and (message['tool_calls'] is not defined or message['tool_calls'] is none) %}{%- if ns.is_last_user %}{{'<|Assistant|>'}}{%- if message['prefix'] is defined and message['prefix'] and thinking %}{{'<think>'}} {%- else %}{{'</think>'}}{%- endif %}{%- endif %}{%- set ns.is_last_user = false -%}{%- if ns.is_tool %}{{message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{%- set content = message['content'] -%}{%- if '</think>' in content %}{%- set content = content.split('</think>', 1)[1] -%}{%- endif %}{{content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_last_user = false -%}{%- set ns.is_tool = true 
-%}{{'<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endfor -%}{%- if add_generation_prompt and ns.is_last_user and not ns.is_tool %}{{'<|Assistant|>'}}{%- if not thinking %}{{'</think>'}}{%- else %}{{'<think>'}}{%- endif %}{% endif %}
config.json ADDED
@@ -0,0 +1,1014 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "DeepseekV3ForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "auto_map": {
8
+ "AutoConfig": "configuration_deepseek.DeepseekV3Config",
9
+ "AutoModel": "modeling_deepseek.DeepseekV3Model",
10
+ "AutoModelForCausalLM": "modeling_deepseek.DeepseekV3ForCausalLM"
11
+ },
12
+ "bos_token_id": 0,
13
+ "eos_token_id": 1,
14
+ "ep_size": 1,
15
+ "first_k_dense_replace": 3,
16
+ "hidden_act": "silu",
17
+ "hidden_size": 7168,
18
+ "initializer_range": 0.02,
19
+ "intermediate_size": 18432,
20
+ "kv_lora_rank": 512,
21
+ "max_position_embeddings": 163840,
22
+ "model_type": "deepseek_v3",
23
+ "moe_intermediate_size": 2048,
24
+ "moe_layer_freq": 1,
25
+ "n_group": 8,
26
+ "n_routed_experts": 256,
27
+ "n_shared_experts": 1,
28
+ "norm_topk_prob": true,
29
+ "num_attention_heads": 128,
30
+ "num_experts_per_tok": 8,
31
+ "num_hidden_layers": 61,
32
+ "num_key_value_heads": 128,
33
+ "num_nextn_predict_layers": 1,
34
+ "pad_token_id": 2,
35
+ "q_lora_rank": 1536,
36
+ "qk_nope_head_dim": 128,
37
+ "qk_rope_head_dim": 64,
38
+ "quantization_config": {
39
+ "autoround_version": "0.6.1.dev",
40
+ "bits": 4,
41
+ "data_type": "int",
42
+ "extra_config": {
43
+ "model.layers.0.mlp.down_proj": {
44
+ "bits": 8
45
+ },
46
+ "model.layers.0.mlp.gate_proj": {
47
+ "bits": 8
48
+ },
49
+ "model.layers.0.mlp.up_proj": {
50
+ "bits": 8
51
+ },
52
+ "model.layers.0.self_attn.kv_a_proj_with_mqa": {
53
+ "bits": 8
54
+ },
55
+ "model.layers.0.self_attn.kv_b_proj": {
56
+ "bits": 8
57
+ },
58
+ "model.layers.0.self_attn.o_proj": {
59
+ "bits": 8
60
+ },
61
+ "model.layers.0.self_attn.q_a_proj": {
62
+ "bits": 8
63
+ },
64
+ "model.layers.0.self_attn.q_b_proj": {
65
+ "bits": 8
66
+ },
67
+ "model.layers.1.mlp.down_proj": {
68
+ "bits": 8
69
+ },
70
+ "model.layers.1.mlp.gate_proj": {
71
+ "bits": 8
72
+ },
73
+ "model.layers.1.mlp.up_proj": {
74
+ "bits": 8
75
+ },
76
+ "model.layers.1.self_attn.kv_a_proj_with_mqa": {
77
+ "bits": 8
78
+ },
79
+ "model.layers.1.self_attn.kv_b_proj": {
80
+ "bits": 8
81
+ },
82
+ "model.layers.1.self_attn.o_proj": {
83
+ "bits": 8
84
+ },
85
+ "model.layers.1.self_attn.q_a_proj": {
86
+ "bits": 8
87
+ },
88
+ "model.layers.1.self_attn.q_b_proj": {
89
+ "bits": 8
90
+ },
91
+ "model.layers.10.self_attn.kv_a_proj_with_mqa": {
92
+ "bits": 8
93
+ },
94
+ "model.layers.10.self_attn.kv_b_proj": {
95
+ "bits": 8
96
+ },
97
+ "model.layers.10.self_attn.o_proj": {
98
+ "bits": 8
99
+ },
100
+ "model.layers.10.self_attn.q_a_proj": {
101
+ "bits": 8
102
+ },
103
+ "model.layers.10.self_attn.q_b_proj": {
104
+ "bits": 8
105
+ },
106
+ "model.layers.11.self_attn.kv_a_proj_with_mqa": {
107
+ "bits": 8
108
+ },
109
+ "model.layers.11.self_attn.kv_b_proj": {
110
+ "bits": 8
111
+ },
112
+ "model.layers.11.self_attn.o_proj": {
113
+ "bits": 8
114
+ },
115
+ "model.layers.11.self_attn.q_a_proj": {
116
+ "bits": 8
117
+ },
118
+ "model.layers.11.self_attn.q_b_proj": {
119
+ "bits": 8
120
+ },
121
+ "model.layers.12.self_attn.kv_a_proj_with_mqa": {
122
+ "bits": 8
123
+ },
124
+ "model.layers.12.self_attn.kv_b_proj": {
125
+ "bits": 8
126
+ },
127
+ "model.layers.12.self_attn.o_proj": {
128
+ "bits": 8
129
+ },
130
+ "model.layers.12.self_attn.q_a_proj": {
131
+ "bits": 8
132
+ },
133
+ "model.layers.12.self_attn.q_b_proj": {
134
+ "bits": 8
135
+ },
136
+ "model.layers.13.self_attn.kv_a_proj_with_mqa": {
137
+ "bits": 8
138
+ },
139
+ "model.layers.13.self_attn.kv_b_proj": {
140
+ "bits": 8
141
+ },
142
+ "model.layers.13.self_attn.o_proj": {
143
+ "bits": 8
144
+ },
145
+ "model.layers.13.self_attn.q_a_proj": {
146
+ "bits": 8
147
+ },
148
+ "model.layers.13.self_attn.q_b_proj": {
149
+ "bits": 8
150
+ },
151
+ "model.layers.14.self_attn.kv_a_proj_with_mqa": {
152
+ "bits": 8
153
+ },
154
+ "model.layers.14.self_attn.kv_b_proj": {
155
+ "bits": 8
156
+ },
157
+ "model.layers.14.self_attn.o_proj": {
158
+ "bits": 8
159
+ },
160
+ "model.layers.14.self_attn.q_a_proj": {
161
+ "bits": 8
162
+ },
163
+ "model.layers.14.self_attn.q_b_proj": {
164
+ "bits": 8
165
+ },
166
+ "model.layers.15.self_attn.kv_a_proj_with_mqa": {
167
+ "bits": 8
168
+ },
169
+ "model.layers.15.self_attn.kv_b_proj": {
170
+ "bits": 8
171
+ },
172
+ "model.layers.15.self_attn.o_proj": {
173
+ "bits": 8
174
+ },
175
+ "model.layers.15.self_attn.q_a_proj": {
176
+ "bits": 8
177
+ },
178
+ "model.layers.15.self_attn.q_b_proj": {
179
+ "bits": 8
180
+ },
181
+ "model.layers.16.self_attn.kv_a_proj_with_mqa": {
182
+ "bits": 8
183
+ },
184
+ "model.layers.16.self_attn.kv_b_proj": {
185
+ "bits": 8
186
+ },
187
+ "model.layers.16.self_attn.o_proj": {
188
+ "bits": 8
189
+ },
190
+ "model.layers.16.self_attn.q_a_proj": {
191
+ "bits": 8
192
+ },
193
+ "model.layers.16.self_attn.q_b_proj": {
194
+ "bits": 8
195
+ },
196
+ "model.layers.17.self_attn.kv_a_proj_with_mqa": {
197
+ "bits": 8
198
+ },
199
+ "model.layers.17.self_attn.kv_b_proj": {
200
+ "bits": 8
201
+ },
202
+ "model.layers.17.self_attn.o_proj": {
203
+ "bits": 8
204
+ },
205
+ "model.layers.17.self_attn.q_a_proj": {
206
+ "bits": 8
207
+ },
208
+ "model.layers.17.self_attn.q_b_proj": {
209
+ "bits": 8
210
+ },
211
+ "model.layers.18.self_attn.kv_a_proj_with_mqa": {
212
+ "bits": 8
213
+ },
214
+ "model.layers.18.self_attn.kv_b_proj": {
215
+ "bits": 8
216
+ },
217
+ "model.layers.18.self_attn.o_proj": {
218
+ "bits": 8
219
+ },
220
+ "model.layers.18.self_attn.q_a_proj": {
221
+ "bits": 8
222
+ },
223
+ "model.layers.18.self_attn.q_b_proj": {
224
+ "bits": 8
225
+ },
226
+ "model.layers.19.self_attn.kv_a_proj_with_mqa": {
227
+ "bits": 8
228
+ },
229
+ "model.layers.19.self_attn.kv_b_proj": {
230
+ "bits": 8
231
+ },
232
+ "model.layers.19.self_attn.o_proj": {
233
+ "bits": 8
234
+ },
235
+ "model.layers.19.self_attn.q_a_proj": {
236
+ "bits": 8
237
+ },
238
+ "model.layers.19.self_attn.q_b_proj": {
239
+ "bits": 8
240
+ },
241
+ "model.layers.2.mlp.down_proj": {
242
+ "bits": 8
243
+ },
244
+ "model.layers.2.mlp.gate_proj": {
245
+ "bits": 8
246
+ },
247
+ "model.layers.2.mlp.up_proj": {
248
+ "bits": 8
249
+ },
250
+ "model.layers.2.self_attn.kv_a_proj_with_mqa": {
251
+ "bits": 8
252
+ },
253
+ "model.layers.2.self_attn.kv_b_proj": {
254
+ "bits": 8
255
+ },
256
+ "model.layers.2.self_attn.o_proj": {
257
+ "bits": 8
258
+ },
259
+ "model.layers.2.self_attn.q_a_proj": {
260
+ "bits": 8
261
+ },
262
+ "model.layers.2.self_attn.q_b_proj": {
263
+ "bits": 8
264
+ },
265
+ "model.layers.20.self_attn.kv_a_proj_with_mqa": {
266
+ "bits": 8
267
+ },
268
+ "model.layers.20.self_attn.kv_b_proj": {
269
+ "bits": 8
270
+ },
271
+ "model.layers.20.self_attn.o_proj": {
272
+ "bits": 8
273
+ },
274
+ "model.layers.20.self_attn.q_a_proj": {
275
+ "bits": 8
276
+ },
277
+ "model.layers.20.self_attn.q_b_proj": {
278
+ "bits": 8
279
+ },
280
+ "model.layers.21.self_attn.kv_a_proj_with_mqa": {
281
+ "bits": 8
282
+ },
283
+ "model.layers.21.self_attn.kv_b_proj": {
284
+ "bits": 8
285
+ },
286
+ "model.layers.21.self_attn.o_proj": {
287
+ "bits": 8
288
+ },
289
+ "model.layers.21.self_attn.q_a_proj": {
290
+ "bits": 8
291
+ },
292
+ "model.layers.21.self_attn.q_b_proj": {
293
+ "bits": 8
294
+ },
295
+ "model.layers.22.self_attn.kv_a_proj_with_mqa": {
296
+ "bits": 8
297
+ },
298
+ "model.layers.22.self_attn.kv_b_proj": {
299
+ "bits": 8
300
+ },
301
+ "model.layers.22.self_attn.o_proj": {
302
+ "bits": 8
303
+ },
304
+ "model.layers.22.self_attn.q_a_proj": {
305
+ "bits": 8
306
+ },
307
+ "model.layers.22.self_attn.q_b_proj": {
308
+ "bits": 8
309
+ },
310
+ "model.layers.23.self_attn.kv_a_proj_with_mqa": {
311
+ "bits": 8
312
+ },
313
+ "model.layers.23.self_attn.kv_b_proj": {
314
+ "bits": 8
315
+ },
316
+ "model.layers.23.self_attn.o_proj": {
317
+ "bits": 8
318
+ },
319
+ "model.layers.23.self_attn.q_a_proj": {
320
+ "bits": 8
321
+ },
322
+ "model.layers.23.self_attn.q_b_proj": {
323
+ "bits": 8
324
+ },
325
+ "model.layers.24.self_attn.kv_a_proj_with_mqa": {
326
+ "bits": 8
327
+ },
328
+ "model.layers.24.self_attn.kv_b_proj": {
329
+ "bits": 8
330
+ },
331
+ "model.layers.24.self_attn.o_proj": {
332
+ "bits": 8
333
+ },
334
+ "model.layers.24.self_attn.q_a_proj": {
335
+ "bits": 8
336
+ },
337
+ "model.layers.24.self_attn.q_b_proj": {
338
+ "bits": 8
339
+ },
340
+ "model.layers.25.self_attn.kv_a_proj_with_mqa": {
341
+ "bits": 8
342
+ },
343
+ "model.layers.25.self_attn.kv_b_proj": {
344
+ "bits": 8
345
+ },
346
+ "model.layers.25.self_attn.o_proj": {
347
+ "bits": 8
348
+ },
349
+ "model.layers.25.self_attn.q_a_proj": {
350
+ "bits": 8
351
+ },
352
+ "model.layers.25.self_attn.q_b_proj": {
353
+ "bits": 8
354
+ },
355
+ "model.layers.26.self_attn.kv_a_proj_with_mqa": {
356
+ "bits": 8
357
+ },
358
+ "model.layers.26.self_attn.kv_b_proj": {
359
+ "bits": 8
360
+ },
361
+ "model.layers.26.self_attn.o_proj": {
362
+ "bits": 8
363
+ },
364
+ "model.layers.26.self_attn.q_a_proj": {
365
+ "bits": 8
366
+ },
367
+ "model.layers.26.self_attn.q_b_proj": {
368
+ "bits": 8
369
+ },
370
+ "model.layers.27.self_attn.kv_a_proj_with_mqa": {
371
+ "bits": 8
372
+ },
373
+ "model.layers.27.self_attn.kv_b_proj": {
374
+ "bits": 8
375
+ },
376
+ "model.layers.27.self_attn.o_proj": {
377
+ "bits": 8
378
+ },
379
+ "model.layers.27.self_attn.q_a_proj": {
380
+ "bits": 8
381
+ },
382
+ "model.layers.27.self_attn.q_b_proj": {
383
+ "bits": 8
384
+ },
385
+ "model.layers.28.self_attn.kv_a_proj_with_mqa": {
386
+ "bits": 8
387
+ },
388
+ "model.layers.28.self_attn.kv_b_proj": {
389
+ "bits": 8
390
+ },
391
+ "model.layers.28.self_attn.o_proj": {
392
+ "bits": 8
393
+ },
394
+ "model.layers.28.self_attn.q_a_proj": {
395
+ "bits": 8
396
+ },
397
+ "model.layers.28.self_attn.q_b_proj": {
398
+ "bits": 8
399
+ },
400
+ "model.layers.29.self_attn.kv_a_proj_with_mqa": {
401
+ "bits": 8
402
+ },
403
+ "model.layers.29.self_attn.kv_b_proj": {
404
+ "bits": 8
405
+ },
406
+ "model.layers.29.self_attn.o_proj": {
407
+ "bits": 8
408
+ },
409
+ "model.layers.29.self_attn.q_a_proj": {
410
+ "bits": 8
411
+ },
412
+ "model.layers.29.self_attn.q_b_proj": {
413
+ "bits": 8
414
+ },
415
+ "model.layers.3.self_attn.kv_a_proj_with_mqa": {
416
+ "bits": 8
417
+ },
418
+ "model.layers.3.self_attn.kv_b_proj": {
419
+ "bits": 8
420
+ },
421
+ "model.layers.3.self_attn.o_proj": {
422
+ "bits": 8
423
+ },
424
+ "model.layers.3.self_attn.q_a_proj": {
425
+ "bits": 8
426
+ },
427
+ "model.layers.3.self_attn.q_b_proj": {
428
+ "bits": 8
429
+ },
430
+ "model.layers.30.self_attn.kv_a_proj_with_mqa": {
431
+ "bits": 8
432
+ },
433
+ "model.layers.30.self_attn.kv_b_proj": {
434
+ "bits": 8
435
+ },
436
+ "model.layers.30.self_attn.o_proj": {
437
+ "bits": 8
438
+ },
439
+ "model.layers.30.self_attn.q_a_proj": {
440
+ "bits": 8
441
+ },
442
+ "model.layers.30.self_attn.q_b_proj": {
443
+ "bits": 8
444
+ },
445
+ "model.layers.31.self_attn.kv_a_proj_with_mqa": {
446
+ "bits": 8
447
+ },
448
+ "model.layers.31.self_attn.kv_b_proj": {
449
+ "bits": 8
450
+ },
451
+ "model.layers.31.self_attn.o_proj": {
452
+ "bits": 8
453
+ },
454
+ "model.layers.31.self_attn.q_a_proj": {
455
+ "bits": 8
456
+ },
457
+ "model.layers.31.self_attn.q_b_proj": {
458
+ "bits": 8
459
+ },
460
+ "model.layers.32.self_attn.kv_a_proj_with_mqa": {
461
+ "bits": 8
462
+ },
463
+ "model.layers.32.self_attn.kv_b_proj": {
464
+ "bits": 8
465
+ },
466
+ "model.layers.32.self_attn.o_proj": {
467
+ "bits": 8
468
+ },
469
+ "model.layers.32.self_attn.q_a_proj": {
470
+ "bits": 8
471
+ },
472
+ "model.layers.32.self_attn.q_b_proj": {
473
+ "bits": 8
474
+ },
475
+ "model.layers.33.self_attn.kv_a_proj_with_mqa": {
476
+ "bits": 8
477
+ },
478
+ "model.layers.33.self_attn.kv_b_proj": {
479
+ "bits": 8
480
+ },
481
+ "model.layers.33.self_attn.o_proj": {
482
+ "bits": 8
483
+ },
484
+ "model.layers.33.self_attn.q_a_proj": {
485
+ "bits": 8
486
+ },
487
+ "model.layers.33.self_attn.q_b_proj": {
488
+ "bits": 8
489
+ },
490
+ "model.layers.34.self_attn.kv_a_proj_with_mqa": {
491
+ "bits": 8
492
+ },
493
+ "model.layers.34.self_attn.kv_b_proj": {
494
+ "bits": 8
495
+ },
496
+ "model.layers.34.self_attn.o_proj": {
497
+ "bits": 8
498
+ },
499
+ "model.layers.34.self_attn.q_a_proj": {
500
+ "bits": 8
501
+ },
502
+ "model.layers.34.self_attn.q_b_proj": {
503
+ "bits": 8
504
+ },
505
+ "model.layers.35.self_attn.kv_a_proj_with_mqa": {
506
+ "bits": 8
507
+ },
508
+ "model.layers.35.self_attn.kv_b_proj": {
509
+ "bits": 8
510
+ },
511
+ "model.layers.35.self_attn.o_proj": {
512
+ "bits": 8
513
+ },
514
+ "model.layers.35.self_attn.q_a_proj": {
515
+ "bits": 8
516
+ },
517
+ "model.layers.35.self_attn.q_b_proj": {
518
+ "bits": 8
519
+ },
520
+ "model.layers.36.self_attn.kv_a_proj_with_mqa": {
521
+ "bits": 8
522
+ },
523
+ "model.layers.36.self_attn.kv_b_proj": {
524
+ "bits": 8
525
+ },
526
+ "model.layers.36.self_attn.o_proj": {
527
+ "bits": 8
528
+ },
529
+ "model.layers.36.self_attn.q_a_proj": {
530
+ "bits": 8
531
+ },
532
+ "model.layers.36.self_attn.q_b_proj": {
533
+ "bits": 8
534
+ },
535
+ "model.layers.37.self_attn.kv_a_proj_with_mqa": {
536
+ "bits": 8
537
+ },
538
+ "model.layers.37.self_attn.kv_b_proj": {
539
+ "bits": 8
540
+ },
541
+ "model.layers.37.self_attn.o_proj": {
542
+ "bits": 8
543
+ },
544
+ "model.layers.37.self_attn.q_a_proj": {
545
+ "bits": 8
546
+ },
547
+ "model.layers.37.self_attn.q_b_proj": {
548
+ "bits": 8
549
+ },
550
+ "model.layers.38.self_attn.kv_a_proj_with_mqa": {
551
+ "bits": 8
552
+ },
553
+ "model.layers.38.self_attn.kv_b_proj": {
554
+ "bits": 8
555
+ },
556
+ "model.layers.38.self_attn.o_proj": {
557
+ "bits": 8
558
+ },
559
+ "model.layers.38.self_attn.q_a_proj": {
560
+ "bits": 8
561
+ },
562
+ "model.layers.38.self_attn.q_b_proj": {
563
+ "bits": 8
564
+ },
565
+ "model.layers.39.self_attn.kv_a_proj_with_mqa": {
566
+ "bits": 8
567
+ },
568
+ "model.layers.39.self_attn.kv_b_proj": {
569
+ "bits": 8
570
+ },
571
+ "model.layers.39.self_attn.o_proj": {
572
+ "bits": 8
573
+ },
574
+ "model.layers.39.self_attn.q_a_proj": {
575
+ "bits": 8
576
+ },
577
+ "model.layers.39.self_attn.q_b_proj": {
578
+ "bits": 8
579
+ },
580
+ "model.layers.4.self_attn.kv_a_proj_with_mqa": {
581
+ "bits": 8
582
+ },
583
+ "model.layers.4.self_attn.kv_b_proj": {
584
+ "bits": 8
585
+ },
586
+ "model.layers.4.self_attn.o_proj": {
587
+ "bits": 8
588
+ },
589
+ "model.layers.4.self_attn.q_a_proj": {
590
+ "bits": 8
591
+ },
592
+ "model.layers.4.self_attn.q_b_proj": {
593
+ "bits": 8
594
+ },
595
+ "model.layers.40.self_attn.kv_a_proj_with_mqa": {
596
+ "bits": 8
597
+ },
598
+ "model.layers.40.self_attn.kv_b_proj": {
599
+ "bits": 8
600
+ },
601
+ "model.layers.40.self_attn.o_proj": {
602
+ "bits": 8
603
+ },
604
+ "model.layers.40.self_attn.q_a_proj": {
605
+ "bits": 8
606
+ },
607
+ "model.layers.40.self_attn.q_b_proj": {
608
+ "bits": 8
609
+ },
610
+ "model.layers.41.self_attn.kv_a_proj_with_mqa": {
611
+ "bits": 8
612
+ },
613
+ "model.layers.41.self_attn.kv_b_proj": {
614
+ "bits": 8
615
+ },
616
+ "model.layers.41.self_attn.o_proj": {
617
+ "bits": 8
618
+ },
619
+ "model.layers.41.self_attn.q_a_proj": {
620
+ "bits": 8
621
+ },
622
+ "model.layers.41.self_attn.q_b_proj": {
623
+ "bits": 8
624
+ },
625
+ "model.layers.42.self_attn.kv_a_proj_with_mqa": {
626
+ "bits": 8
627
+ },
628
+ "model.layers.42.self_attn.kv_b_proj": {
629
+ "bits": 8
630
+ },
631
+ "model.layers.42.self_attn.o_proj": {
632
+ "bits": 8
633
+ },
634
+ "model.layers.42.self_attn.q_a_proj": {
635
+ "bits": 8
636
+ },
637
+ "model.layers.42.self_attn.q_b_proj": {
638
+ "bits": 8
639
+ },
640
+ "model.layers.43.self_attn.kv_a_proj_with_mqa": {
641
+ "bits": 8
642
+ },
643
+ "model.layers.43.self_attn.kv_b_proj": {
644
+ "bits": 8
645
+ },
646
+ "model.layers.43.self_attn.o_proj": {
647
+ "bits": 8
648
+ },
649
+ "model.layers.43.self_attn.q_a_proj": {
650
+ "bits": 8
651
+ },
652
+ "model.layers.43.self_attn.q_b_proj": {
653
+ "bits": 8
654
+ },
655
+ "model.layers.44.self_attn.kv_a_proj_with_mqa": {
656
+ "bits": 8
657
+ },
658
+ "model.layers.44.self_attn.kv_b_proj": {
659
+ "bits": 8
660
+ },
661
+ "model.layers.44.self_attn.o_proj": {
662
+ "bits": 8
663
+ },
664
+ "model.layers.44.self_attn.q_a_proj": {
665
+ "bits": 8
666
+ },
667
+ "model.layers.44.self_attn.q_b_proj": {
668
+ "bits": 8
669
+ },
670
+ "model.layers.45.self_attn.kv_a_proj_with_mqa": {
671
+ "bits": 8
672
+ },
673
+ "model.layers.45.self_attn.kv_b_proj": {
674
+ "bits": 8
675
+ },
676
+ "model.layers.45.self_attn.o_proj": {
677
+ "bits": 8
678
+ },
679
+ "model.layers.45.self_attn.q_a_proj": {
680
+ "bits": 8
681
+ },
682
+ "model.layers.45.self_attn.q_b_proj": {
683
+ "bits": 8
684
+ },
685
+ "model.layers.46.self_attn.kv_a_proj_with_mqa": {
686
+ "bits": 8
687
+ },
688
+ "model.layers.46.self_attn.kv_b_proj": {
689
+ "bits": 8
690
+ },
691
+ "model.layers.46.self_attn.o_proj": {
692
+ "bits": 8
693
+ },
694
+ "model.layers.46.self_attn.q_a_proj": {
695
+ "bits": 8
696
+ },
697
+ "model.layers.46.self_attn.q_b_proj": {
698
+ "bits": 8
699
+ },
700
+ "model.layers.47.self_attn.kv_a_proj_with_mqa": {
701
+ "bits": 8
702
+ },
703
+ "model.layers.47.self_attn.kv_b_proj": {
704
+ "bits": 8
705
+ },
706
+ "model.layers.47.self_attn.o_proj": {
707
+ "bits": 8
708
+ },
709
+ "model.layers.47.self_attn.q_a_proj": {
710
+ "bits": 8
711
+ },
712
+ "model.layers.47.self_attn.q_b_proj": {
713
+ "bits": 8
714
+ },
715
+ "model.layers.48.self_attn.kv_a_proj_with_mqa": {
716
+ "bits": 8
717
+ },
718
+ "model.layers.48.self_attn.kv_b_proj": {
719
+ "bits": 8
720
+ },
721
+ "model.layers.48.self_attn.o_proj": {
722
+ "bits": 8
723
+ },
724
+ "model.layers.48.self_attn.q_a_proj": {
725
+ "bits": 8
726
+ },
727
+ "model.layers.48.self_attn.q_b_proj": {
728
+ "bits": 8
729
+ },
730
+ "model.layers.49.self_attn.kv_a_proj_with_mqa": {
731
+ "bits": 8
732
+ },
733
+ "model.layers.49.self_attn.kv_b_proj": {
734
+ "bits": 8
735
+ },
736
+ "model.layers.49.self_attn.o_proj": {
737
+ "bits": 8
738
+ },
739
+ "model.layers.49.self_attn.q_a_proj": {
740
+ "bits": 8
741
+ },
742
+ "model.layers.49.self_attn.q_b_proj": {
743
+ "bits": 8
744
+ },
745
+ "model.layers.5.self_attn.kv_a_proj_with_mqa": {
746
+ "bits": 8
747
+ },
748
+ "model.layers.5.self_attn.kv_b_proj": {
749
+ "bits": 8
750
+ },
751
+ "model.layers.5.self_attn.o_proj": {
752
+ "bits": 8
753
+ },
754
+ "model.layers.5.self_attn.q_a_proj": {
755
+ "bits": 8
756
+ },
757
+ "model.layers.5.self_attn.q_b_proj": {
758
+ "bits": 8
759
+ },
760
+ "model.layers.50.self_attn.kv_a_proj_with_mqa": {
761
+ "bits": 8
762
+ },
763
+ "model.layers.50.self_attn.kv_b_proj": {
764
+ "bits": 8
765
+ },
766
+ "model.layers.50.self_attn.o_proj": {
767
+ "bits": 8
768
+ },
769
+ "model.layers.50.self_attn.q_a_proj": {
770
+ "bits": 8
771
+ },
772
+ "model.layers.50.self_attn.q_b_proj": {
773
+ "bits": 8
774
+ },
775
+ "model.layers.51.self_attn.kv_a_proj_with_mqa": {
776
+ "bits": 8
777
+ },
778
+ "model.layers.51.self_attn.kv_b_proj": {
779
+ "bits": 8
780
+ },
781
+ "model.layers.51.self_attn.o_proj": {
782
+ "bits": 8
783
+ },
784
+ "model.layers.51.self_attn.q_a_proj": {
785
+ "bits": 8
786
+ },
787
+ "model.layers.51.self_attn.q_b_proj": {
788
+ "bits": 8
789
+ },
790
+ "model.layers.52.self_attn.kv_a_proj_with_mqa": {
791
+ "bits": 8
792
+ },
793
+ "model.layers.52.self_attn.kv_b_proj": {
794
+ "bits": 8
795
+ },
796
+ "model.layers.52.self_attn.o_proj": {
797
+ "bits": 8
798
+ },
799
+ "model.layers.52.self_attn.q_a_proj": {
800
+ "bits": 8
801
+ },
802
+ "model.layers.52.self_attn.q_b_proj": {
803
+ "bits": 8
804
+ },
805
+ "model.layers.53.self_attn.kv_a_proj_with_mqa": {
806
+ "bits": 8
807
+ },
808
+ "model.layers.53.self_attn.kv_b_proj": {
809
+ "bits": 8
810
+ },
811
+ "model.layers.53.self_attn.o_proj": {
812
+ "bits": 8
813
+ },
814
+ "model.layers.53.self_attn.q_a_proj": {
815
+ "bits": 8
816
+ },
817
+ "model.layers.53.self_attn.q_b_proj": {
818
+ "bits": 8
819
+ },
820
+ "model.layers.54.self_attn.kv_a_proj_with_mqa": {
821
+ "bits": 8
822
+ },
823
+ "model.layers.54.self_attn.kv_b_proj": {
824
+ "bits": 8
825
+ },
826
+ "model.layers.54.self_attn.o_proj": {
827
+ "bits": 8
828
+ },
829
+ "model.layers.54.self_attn.q_a_proj": {
830
+ "bits": 8
831
+ },
832
+ "model.layers.54.self_attn.q_b_proj": {
833
+ "bits": 8
834
+ },
835
+ "model.layers.55.self_attn.kv_a_proj_with_mqa": {
836
+ "bits": 8
837
+ },
838
+ "model.layers.55.self_attn.kv_b_proj": {
839
+ "bits": 8
840
+ },
841
+ "model.layers.55.self_attn.o_proj": {
842
+ "bits": 8
843
+ },
844
+ "model.layers.55.self_attn.q_a_proj": {
845
+ "bits": 8
846
+ },
847
+ "model.layers.55.self_attn.q_b_proj": {
848
+ "bits": 8
849
+ },
850
+ "model.layers.56.self_attn.kv_a_proj_with_mqa": {
851
+ "bits": 8
852
+ },
853
+ "model.layers.56.self_attn.kv_b_proj": {
854
+ "bits": 8
855
+ },
856
+ "model.layers.56.self_attn.o_proj": {
857
+ "bits": 8
858
+ },
859
+ "model.layers.56.self_attn.q_a_proj": {
860
+ "bits": 8
861
+ },
862
+ "model.layers.56.self_attn.q_b_proj": {
863
+ "bits": 8
864
+ },
865
+ "model.layers.57.self_attn.kv_a_proj_with_mqa": {
866
+ "bits": 8
867
+ },
868
+ "model.layers.57.self_attn.kv_b_proj": {
869
+ "bits": 8
870
+ },
871
+ "model.layers.57.self_attn.o_proj": {
872
+ "bits": 8
873
+ },
874
+ "model.layers.57.self_attn.q_a_proj": {
875
+ "bits": 8
876
+ },
877
+ "model.layers.57.self_attn.q_b_proj": {
878
+ "bits": 8
879
+ },
880
+ "model.layers.58.self_attn.kv_a_proj_with_mqa": {
881
+ "bits": 8
882
+ },
883
+ "model.layers.58.self_attn.kv_b_proj": {
884
+ "bits": 8
885
+ },
886
+ "model.layers.58.self_attn.o_proj": {
887
+ "bits": 8
888
+ },
889
+ "model.layers.58.self_attn.q_a_proj": {
890
+ "bits": 8
891
+ },
892
+ "model.layers.58.self_attn.q_b_proj": {
893
+ "bits": 8
894
+ },
895
+ "model.layers.59.self_attn.kv_a_proj_with_mqa": {
896
+ "bits": 8
897
+ },
898
+ "model.layers.59.self_attn.kv_b_proj": {
899
+ "bits": 8
900
+ },
901
+ "model.layers.59.self_attn.o_proj": {
902
+ "bits": 8
903
+ },
904
+ "model.layers.59.self_attn.q_a_proj": {
905
+ "bits": 8
906
+ },
907
+ "model.layers.59.self_attn.q_b_proj": {
908
+ "bits": 8
909
+ },
910
+ "model.layers.6.self_attn.kv_a_proj_with_mqa": {
911
+ "bits": 8
912
+ },
913
+ "model.layers.6.self_attn.kv_b_proj": {
914
+ "bits": 8
915
+ },
916
+ "model.layers.6.self_attn.o_proj": {
917
+ "bits": 8
918
+ },
919
+ "model.layers.6.self_attn.q_a_proj": {
920
+ "bits": 8
921
+ },
922
+ "model.layers.6.self_attn.q_b_proj": {
923
+ "bits": 8
924
+ },
925
+ "model.layers.60.self_attn.kv_a_proj_with_mqa": {
926
+ "bits": 8
927
+ },
928
+ "model.layers.60.self_attn.kv_b_proj": {
929
+ "bits": 8
930
+ },
931
+ "model.layers.60.self_attn.o_proj": {
932
+ "bits": 8
933
+ },
934
+ "model.layers.60.self_attn.q_a_proj": {
935
+ "bits": 8
936
+ },
937
+ "model.layers.60.self_attn.q_b_proj": {
938
+ "bits": 8
939
+ },
940
+ "model.layers.7.self_attn.kv_a_proj_with_mqa": {
941
+ "bits": 8
942
+ },
943
+ "model.layers.7.self_attn.kv_b_proj": {
944
+ "bits": 8
945
+ },
946
+ "model.layers.7.self_attn.o_proj": {
947
+ "bits": 8
948
+ },
949
+ "model.layers.7.self_attn.q_a_proj": {
950
+ "bits": 8
951
+ },
952
+ "model.layers.7.self_attn.q_b_proj": {
953
+ "bits": 8
954
+ },
955
+ "model.layers.8.self_attn.kv_a_proj_with_mqa": {
956
+ "bits": 8
957
+ },
958
+ "model.layers.8.self_attn.kv_b_proj": {
959
+ "bits": 8
960
+ },
961
+ "model.layers.8.self_attn.o_proj": {
962
+ "bits": 8
963
+ },
964
+ "model.layers.8.self_attn.q_a_proj": {
965
+ "bits": 8
966
+ },
967
+ "model.layers.8.self_attn.q_b_proj": {
968
+ "bits": 8
969
+ },
970
+ "model.layers.9.self_attn.kv_a_proj_with_mqa": {
971
+ "bits": 8
972
+ },
973
+ "model.layers.9.self_attn.kv_b_proj": {
974
+ "bits": 8
975
+ },
976
+ "model.layers.9.self_attn.o_proj": {
977
+ "bits": 8
978
+ },
979
+ "model.layers.9.self_attn.q_a_proj": {
980
+ "bits": 8
981
+ },
982
+ "model.layers.9.self_attn.q_b_proj": {
983
+ "bits": 8
984
+ }
985
+ },
986
+ "group_size": 128,
987
+ "iters": 0,
988
+ "packing_format": "auto_round:auto_gptq",
989
+ "quant_method": "auto-round",
990
+ "sym": true
991
+ },
992
+ "rms_norm_eps": 1e-06,
993
+ "rope_scaling": {
994
+ "beta_fast": 32,
995
+ "beta_slow": 1,
996
+ "factor": 40,
997
+ "mscale": 1.0,
998
+ "mscale_all_dim": 1.0,
999
+ "original_max_position_embeddings": 4096,
1000
+ "type": "yarn"
1001
+ },
1002
+ "rope_theta": 10000,
1003
+ "routed_scaling_factor": 2.5,
1004
+ "scoring_func": "sigmoid",
1005
+ "tie_word_embeddings": false,
1006
+ "topk_group": 4,
1007
+ "topk_method": "noaux_tc",
1008
+ "torch_dtype": "bfloat16",
1009
+ "transformers_version": "4.56.0.dev0",
1010
+ "unsloth_fixed": true,
1011
+ "use_cache": true,
1012
+ "v_head_dim": 128,
1013
+ "vocab_size": 129280
1014
+ }
configuration_deepseek.py ADDED
@@ -0,0 +1,199 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers.configuration_utils import PretrainedConfig
2
+ from transformers.utils import logging
3
+
4
+ logger = logging.get_logger(__name__)
5
+
6
+ DEEPSEEK_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
7
class DeepseekV3Config(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`DeepseekV3Model`]. It is used to instantiate a
    DeepSeek model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of DeepSeek-V3.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 129280):
            Vocabulary size of the model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`DeepseekV3Model`].
        hidden_size (`int`, *optional*, defaults to 7168):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 18432):
            Dimension of the (dense) MLP representations.
        moe_intermediate_size (`int`, *optional*, defaults to 2048):
            Dimension of each MoE expert's MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 61):
            Number of hidden layers in the Transformer decoder.
        num_nextn_predict_layers (`int`, *optional*, defaults to 1):
            Number of next-n (multi-token) prediction layers in the DeepSeek-V3 model.
        num_attention_heads (`int`, *optional*, defaults to 128):
            Number of attention heads for each attention layer in the Transformer decoder.
        num_key_value_heads (`int`, *optional*, defaults to 128):
            Number of key/value heads used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads` the model uses Multi Head Attention (MHA); if
            `num_key_value_heads=1` it uses Multi Query Attention (MQA); otherwise GQA is used. For more details
            checkout [this paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
            `num_attention_heads`.
        n_shared_experts (`int`, *optional*, defaults to 1):
            Number of shared experts. `None` means a dense model.
        n_routed_experts (`int`, *optional*, defaults to 256):
            Number of routed experts. `None` means a dense model.
        ep_size (`int`, *optional*, defaults to 1):
            Expert-parallel world size used to shard routed experts across devices.
        routed_scaling_factor (`float`, *optional*, defaults to 2.5):
            Scaling factor applied to the routed experts' output.
        kv_lora_rank (`int`, *optional*, defaults to 512):
            Rank of the low-rank compression applied to the key/value projections (Multi-head Latent Attention).
        q_lora_rank (`int`, *optional*, defaults to 1536):
            Rank of the low-rank compression applied to the query projections.
        qk_rope_head_dim (`int`, *optional*, defaults to 64):
            Per-head dimension of the query/key components that carry rotary position embeddings.
        v_head_dim (`int`, *optional*, defaults to 128):
            Per-head dimension of the value projections.
        qk_nope_head_dim (`int`, *optional*, defaults to 128):
            Per-head dimension of the query/key components without rotary position embeddings.
        topk_method (`str`, *optional*, defaults to `"noaux_tc"`):
            Top-k method used in the routed gate.
        n_group (`int`, *optional*, defaults to 8):
            Number of groups for routed experts.
        topk_group (`int`, *optional*, defaults to 4):
            Number of selected groups for each token (for each token, the selected experts are only within
            `topk_group` groups).
        num_experts_per_tok (`int`, *optional*, defaults to 8):
            Number of experts selected per token. `None` means a dense model.
        moe_layer_freq (`int`, *optional*, defaults to 1):
            The frequency of the MoE layer: one expert layer for every `moe_layer_freq - 1` dense layers.
        first_k_dense_replace (`int`, *optional*, defaults to 3):
            Number of dense layers at the start of the model
            (embed -> dense -> ... -> dense -> moe -> ... -> moe -> lm_head).
        norm_topk_prob (`bool`, *optional*, defaults to `True`):
            Whether to normalize the weights of the routed experts.
        scoring_func (`str`, *optional*, defaults to `"sigmoid"`):
            Method of computing expert weights.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 4096):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        pad_token_id (`int`, *optional*):
            Padding token id.
        bos_token_id (`int`, *optional*, defaults to 0):
            Beginning of stream token id.
        eos_token_id (`int`, *optional*, defaults to 1):
            End of stream token id.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether to tie weight embeddings.
        rope_theta (`float`, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings.
        rope_scaling (`Dict`, *optional*):
            Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
            strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format
            is `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
            `max_position_embeddings` to the expected new maximum.
        attention_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.

    ```python
    >>> from transformers import DeepseekV3Model, DeepseekV3Config

    >>> # Initializing a DeepSeek-V3 style configuration
    >>> configuration = DeepseekV3Config()

    >>> # Accessing the model configuration
    >>> model = DeepseekV3Model(configuration)
    >>> configuration = model.config
    ```"""

    model_type = "deepseek_v3"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=129280,
        hidden_size=7168,
        intermediate_size=18432,
        moe_intermediate_size=2048,
        num_hidden_layers=61,
        num_nextn_predict_layers=1,
        num_attention_heads=128,
        num_key_value_heads=128,
        n_shared_experts=1,
        n_routed_experts=256,
        ep_size=1,
        routed_scaling_factor=2.5,
        kv_lora_rank=512,
        q_lora_rank=1536,
        qk_rope_head_dim=64,
        v_head_dim=128,
        qk_nope_head_dim=128,
        topk_method="noaux_tc",
        n_group=8,
        topk_group=4,
        num_experts_per_tok=8,
        moe_layer_freq=1,
        first_k_dense_replace=3,
        norm_topk_prob=True,
        scoring_func="sigmoid",
        hidden_act="silu",
        max_position_embeddings=4096,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        pad_token_id=None,
        bos_token_id=0,
        eos_token_id=1,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        rope_scaling=None,
        attention_bias=False,
        attention_dropout=0.0,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.moe_intermediate_size = moe_intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_nextn_predict_layers = num_nextn_predict_layers
        self.num_attention_heads = num_attention_heads
        self.n_shared_experts = n_shared_experts
        self.n_routed_experts = n_routed_experts
        self.ep_size = ep_size
        self.routed_scaling_factor = routed_scaling_factor
        self.kv_lora_rank = kv_lora_rank
        self.q_lora_rank = q_lora_rank
        self.qk_rope_head_dim = qk_rope_head_dim
        self.v_head_dim = v_head_dim
        self.qk_nope_head_dim = qk_nope_head_dim
        self.topk_method = topk_method
        self.n_group = n_group
        self.topk_group = topk_group
        self.num_experts_per_tok = num_experts_per_tok
        self.moe_layer_freq = moe_layer_freq
        self.first_k_dense_replace = first_k_dense_replace
        self.norm_topk_prob = norm_topk_prob
        self.scoring_func = scoring_func
        # for backward compatibility: fall back to MHA when KV heads unspecified
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )
generation_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 0,
4
+ "do_sample": true,
5
+ "eos_token_id": 1,
6
+ "temperature": 0.6,
7
+ "top_p": 0.95,
8
+ "transformers_version": "4.56.0.dev0"
9
+ }
model-00001-of-00072.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ca84f9f40ef334eef962405e9688404ae652eb85eef733e7dfe9fd7d8e40dc6d
3
+ size 4995710664
model-00002-of-00072.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b5a5ba61ae9bf2b3907e48105177902846e063f40a76301c19907c68f47c95b1
3
+ size 5000051392
model-00003-of-00072.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b08a3f557e84a92e41f1e92f99f550cc0d8a8442f21b95eb3fa480703eecc1f8
3
+ size 4995991608
model-00004-of-00072.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7020b0c09d1aca0b1a508d4d3039b3904dbd74953334cd8a2769ec3608b9108b
3
+ size 5000050912
model-00005-of-00072.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:72f624ca05d3b54dfc34267ee659ba12d15d5b372db2feec13fe2b496a2d8211
3
+ size 5000222944
model-00006-of-00072.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c3a0f1fb2164bab6a13fa7a54501d09996c34d7e41a6cb2d39d9d621e0410ad4
3
+ size 4992940416
model-00007-of-00072.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a4be835f87031118ce99f89b55553273a1956f44fb987f632e5bdc7374688407
3
+ size 5000051336
model-00008-of-00072.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f68acaf80fe8ed102b0a1f946c9fc49548de37d408c89460d8fd3a49d98f49dd
3
+ size 4995991936
model-00009-of-00072.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:347da340973bc3c9e78814c9b0b37007d2342092961b2260ecb8360db190d912
3
+ size 5000050912
model-00010-of-00072.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f808ce67c8deee35f928cd4f507d0b7d91b9782752adce381940569cb390982f
3
+ size 5000224376
model-00011-of-00072.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c9c32b02f7fc2fbaa588c4a1a561c3c845b5cdcc97e4f9d4d990558d3db30414
3
+ size 4992942152
model-00012-of-00072.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9112c487f3ecf185605d4cf4dac92138519dbe5ace64f3a9be1c9a04630eea4b
3
+ size 5000053152
model-00013-of-00072.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c184fc0ee3bf5e33047f40f4c2855844c909040a4b95b2cff34bc99b018ff625
3
+ size 5000225280
model-00014-of-00072.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9909ab2a8a7ce7bc5a7c5b19feeb2d9f3e060c4d0ca8179c58db9a738be83436
3
+ size 4996051216
model-00015-of-00072.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:da31552606f00a3a21049637416f845c0f55cedc40e060f120114447503cf0d3
3
+ size 5000052816
model-00016-of-00072.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f0bc3bad1cac7808d24cd57b4cb3409052a9b06de766d9413fea2f8c868b6600
3
+ size 5000224856
model-00017-of-00072.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:17b5c9c247567c7624beea886458abeae76c9958aacf888925f01879dd5ac209
3
+ size 4992942208
model-00018-of-00072.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9c04f2c0095391732d42259480fda5d2219e457d01e0179e36e450ca078f4db3
3
+ size 5000053248
model-00019-of-00072.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a7aba70afcdf3ce089821a36d63c5dbfec323c4bac17c8354057770ecb9a11e3
3
+ size 4887946080
model-00020-of-00072.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:10a936066825e6e83310f5beda543af76caa7454c228261ccb6ebcecb388b5d9
3
+ size 4993923416
model-00021-of-00072.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dbf954fdfee1482d82ae4b7dd3b81f3c96e94503c3cd0b879cd84d8601f39e6b
3
+ size 5000052816
model-00022-of-00072.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b1e391cdccc84a5b3ec1723c5f871e2070a0d521ff77c356bf36ae76f6878b03
3
+ size 5000224856
model-00023-of-00072.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:702015a7919d71819afb95f0c9e3cf74216fcf4596f9062189e73fcf569f9d39
3
+ size 4992942408
model-00024-of-00072.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:422dd7e4209744716eeaa76779f30457b7d1b2e6bf105d4947e671ce6921e523
3
+ size 5000053248
model-00025-of-00072.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:476c13bc42d1fffcabcf1f83229aa435201c5a2f92e008305e3278d674fa792e
3
+ size 4995993824
model-00026-of-00072.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:441cf0656c2358e3e15a326b8f280de0ae09a4b7481e0752d85571a6e7ad76c1
3
+ size 5000052816
model-00027-of-00072.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0c0eaca5ef29ed2e0b985af88687d13699f075332e45f006c66530e0c740985c
3
+ size 5000224856
model-00028-of-00072.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4db427a1585b57fdd8fc3b84138c00548c9b310e0d5fb2fe3f69944eae6dc5d0
3
+ size 4992942152
model-00029-of-00072.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e276ebe719c214518a53b38396ad526159d32071d8df3fbd99316e9add87be57
3
+ size 5000053232
model-00030-of-00072.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b9ae27cc538f8ab8da4ce267c76d7b0f4dc33bccaf5067a0b12ba5cf143a2e31
3
+ size 5000225304
model-00031-of-00072.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:983b30e8172ebda2b844d5287959e8c50d731fa3371a207257d680aa5b9121cb
3
+ size 4996051112
model-00032-of-00072.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8ee2587fcdbcb7966e2e46e557b9a86df222f86fc9ff3c78a37b08b45c1134b5
3
+ size 5000052816
model-00033-of-00072.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d3d88968cec7322b8b0559f518f0d2b3fab1adbbe1940a3e0078fdb810f4a040
3
+ size 5000224856
model-00034-of-00072.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f0f2715617fe58702606a3891d3eff57ce5f6b402cbeeefacea858a64b3f0b01
3
+ size 4992942288
model-00035-of-00072.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fa403dc7a766d421d56db4fab0e8144c33307cbface657e58ce08dd0723a21d7
3
+ size 5000053248
model-00036-of-00072.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3af20d5f58e03c46ebcb02a9127eb3a80cd96257e778c7000b2a28a609c1864f
3
+ size 4999664696
model-00037-of-00072.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:74459e2953f3b36afa8426eab858336930b597ac3c652bb16e30ba4fb2f301db
3
+ size 4996611560
model-00038-of-00072.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d3d06df18ed937935a1fbafb1d212ac8c0c5225f2a32285f8f364092586c45f5
3
+ size 5000052816
model-00039-of-00072.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8de186711e1ef31eea8eb178c4cb45f39130d77623617982726c2f73ec384667
3
+ size 5000224856
model-00040-of-00072.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f5ab209db638b7e349a3f8266abed57b7bcb84320b4e33aff59e8e2aff184f5b
3
+ size 4992942440
model-00041-of-00072.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b52b41a366e94ca5c038940517cd4f594850c5bcc2cc65f27686e267e99b7675
3
+ size 5000053248
model-00042-of-00072.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d4f148561dd6769a5ff5c8242aaea132cbe7c13031dc6af0f5b7a72bb5800d2a
3
+ size 4995993792
model-00043-of-00072.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b06564b3422d93e8d8a171ab270de1baff4e4aca1d997b06a496079b1078f78c
3
+ size 5000052816
model-00044-of-00072.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ff9c0c8c6a4aef00b34a1b2766ae347ab853629c7da6f9c48e942c65a55c381c
3
+ size 5000224856
model-00045-of-00072.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1c4b7abd0c88a285639dcefe4ed905d0d0b31958c94ea3353df623737a0e3f27
3
+ size 4992942176