yujiepan committed on
Commit 8d4a095 · verified · 1 Parent(s): 7901f48

Upload folder using huggingface_hub

Files changed (3)
  1. README.md +16 -13
  2. config.json +2 -2
  3. model.safetensors +2 -2
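The commit message notes the folder was uploaded with huggingface_hub. For reference, a minimal sketch of pinning a local copy to this exact revision (assumes huggingface_hub is installed; if the short hash shown on this page is not accepted, use the full 40-character commit SHA):

```python
# Minimal sketch (not part of the commit): download the repo at this revision.
from huggingface_hub import snapshot_download

local_dir = snapshot_download(
    repo_id="yujiepan/deepseek-v3.1-tiny-random",
    revision="8d4a095",  # commit shown above; full SHA also works
)
print(local_dir)
```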
README.md CHANGED
@@ -17,7 +17,11 @@ This tiny model is for debugging. It is randomly initialized with the config ada
 - vLLM
 
 ```bash
-vllm serve yujiepan/deepseek-v3.1-tiny-random --trust-remote-code
+python -m vllm.entrypoints.openai.api_server \
+  --tensor-parallel-size 2 \
+  --model yujiepan/deepseek-v3.1-tiny-random \
+  --trust-remote-code \
+  --speculative-config='{"method": "deepseek_mtp", "num_speculative_tokens": 1}'
 ```
 
 - Transformers
@@ -57,7 +61,6 @@ from transformers import (
     GenerationConfig,
     set_seed,
 )
-
 from transformers.models.glm4_moe.modeling_glm4_moe import Glm4MoeRMSNorm
 source_model_id = "deepseek-ai/DeepSeek-V3.1-Base"
 save_folder = "/tmp/yujiepan/deepseek-v3.1-tiny-random"
@@ -79,9 +82,9 @@ config_json.update({
     'moe_intermediate_size': 64,
     'n_routed_experts': 32,
     'n_shared_experts': 1,
-    'num_attention_heads': 1,
+    'num_attention_heads': 4,
     'num_experts_per_tok': 8,
-    'num_key_value_heads': 1,
+    'num_key_value_heads': 4,
     'q_lora_rank': 32,
     'qk_nope_head_dim': 64,
     'qk_rope_head_dim': 192,  # vllm mla kernel supports 576 only, FA supports head dim <= 256
@@ -169,11 +172,11 @@ DeepseekV3ForCausalLM(
         (self_attn): DeepseekV3Attention(
           (q_a_proj): Linear(in_features=8, out_features=32, bias=False)
           (q_a_layernorm): DeepseekV3RMSNorm()
-          (q_b_proj): Linear(in_features=32, out_features=256, bias=False)
+          (q_b_proj): Linear(in_features=32, out_features=1024, bias=False)
           (kv_a_proj_with_mqa): Linear(in_features=8, out_features=576, bias=False)
           (kv_a_layernorm): DeepseekV3RMSNorm()
-          (kv_b_proj): Linear(in_features=384, out_features=128, bias=False)
-          (o_proj): Linear(in_features=64, out_features=8, bias=False)
+          (kv_b_proj): Linear(in_features=384, out_features=512, bias=False)
+          (o_proj): Linear(in_features=256, out_features=8, bias=False)
           (rotary_emb): DeepseekV3YarnRotaryEmbedding()
         )
         (mlp): DeepseekV3MLP(
@@ -189,11 +192,11 @@ DeepseekV3ForCausalLM(
         (self_attn): DeepseekV3Attention(
           (q_a_proj): Linear(in_features=8, out_features=32, bias=False)
           (q_a_layernorm): DeepseekV3RMSNorm()
-          (q_b_proj): Linear(in_features=32, out_features=256, bias=False)
+          (q_b_proj): Linear(in_features=32, out_features=1024, bias=False)
          (kv_a_proj_with_mqa): Linear(in_features=8, out_features=576, bias=False)
           (kv_a_layernorm): DeepseekV3RMSNorm()
-          (kv_b_proj): Linear(in_features=384, out_features=128, bias=False)
-          (o_proj): Linear(in_features=64, out_features=8, bias=False)
+          (kv_b_proj): Linear(in_features=384, out_features=512, bias=False)
+          (o_proj): Linear(in_features=256, out_features=8, bias=False)
           (rotary_emb): DeepseekV3YarnRotaryEmbedding()
         )
         (mlp): DeepseekV3MoE(
@@ -220,11 +223,11 @@ DeepseekV3ForCausalLM(
         (self_attn): DeepseekV3Attention(
           (q_a_proj): Linear(in_features=8, out_features=32, bias=False)
           (q_a_layernorm): DeepseekV3RMSNorm()
-          (q_b_proj): Linear(in_features=32, out_features=256, bias=False)
+          (q_b_proj): Linear(in_features=32, out_features=1024, bias=False)
           (kv_a_proj_with_mqa): Linear(in_features=8, out_features=576, bias=False)
           (kv_a_layernorm): DeepseekV3RMSNorm()
-          (kv_b_proj): Linear(in_features=384, out_features=128, bias=False)
-          (o_proj): Linear(in_features=64, out_features=8, bias=False)
+          (kv_b_proj): Linear(in_features=384, out_features=512, bias=False)
+          (o_proj): Linear(in_features=256, out_features=8, bias=False)
           (rotary_emb): DeepseekV3YarnRotaryEmbedding()
         )
         (mlp): DeepseekV3MoE(
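With the server from the updated vLLM command above running, a minimal sketch of a client request against its OpenAI-compatible endpoint (assumptions: default local port 8000 and the `openai` Python package installed; the model has random weights, so outputs are not meaningful):

```python
# Minimal sketch (not part of the model card): query the OpenAI-compatible
# endpoint exposed by `python -m vllm.entrypoints.openai.api_server ...`.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
completion = client.completions.create(
    model="yujiepan/deepseek-v3.1-tiny-random",
    prompt="Hello, world",
    max_tokens=16,
)
print(completion.choices[0].text)  # random-weight debug model: expect gibberish
```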
config.json CHANGED
@@ -26,10 +26,10 @@
   "n_routed_experts": 32,
   "n_shared_experts": 1,
   "norm_topk_prob": true,
-  "num_attention_heads": 1,
+  "num_attention_heads": 4,
   "num_experts_per_tok": 8,
   "num_hidden_layers": 2,
-  "num_key_value_heads": 1,
+  "num_key_value_heads": 4,
   "num_nextn_predict_layers": 1,
   "q_lora_rank": 32,
   "qk_nope_head_dim": 64,
model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ac1b518d57a554f76b7e2598e9d5c64bd73fad132184859811de843787ecda34
-size 8887136
+oid sha256:5d0260eb708d0e3cd43e4b374eb0df160a0032948f72bbec48782bd7aae59e1e
+size 9928552