imdatta0 committed
Commit f045612 · verified · 1 Parent(s): 5762b49

Upload GptOssForCausalLM

model-00001-of-00009.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:4dfc746024f0959f61506acee7e68b73c08abd95086320ca0cf9a00eaad5f8c8
-size 4968986544
+oid sha256:7405ebb3dba293e987385b856dc066e0f54864ada27c6c9dedde82cc8052e9ab
+size 4968986592
model-00002-of-00009.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:05f4974c3d0b34ad4a65c9e69943657622038bf3b0295b1ec77378830ae87edc
-size 4972362872
+oid sha256:b502e596b87ca4b0b863803100aad4daad937d58b968260c7b0683fe03952703
+size 4972362912
model-00003-of-00009.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:3ea6a0d0fd8e16e7fa65ee3fbb9fab5464dac005c3e55a046dcc447b4a0f8d32
-size 4972362872
+oid sha256:bace628356d060992f000fef2d19e6ce79e25fb61cd5b37a65443a1a5f9a975c
+size 4972362912
model-00004-of-00009.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:3b3a2ca1d60ed8d01021cba531b0907b924371ab0ad105bf0c05696867687dd7
-size 4972363056
+oid sha256:d0c19d33dabbf18a9cead94d4591cafde66159fbc2a9b3c537841cf0b2ba06fe
+size 4972363096
model-00005-of-00009.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:8ea5d0168556893d32fd3b1bd898fe5dd7f6da39795a982cc4ca32db1c023299
-size 4972363296
+oid sha256:eb1ed1de1be0829515b135010615bd5d7f619cd63a9414b9b6195911012a799a
+size 4972363336
model-00006-of-00009.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:2e15caab3bf704743d5af3050a83cb105a96cd515bf75f3e0824dca038cf3c94
-size 4972363296
+oid sha256:839812f041f34cb0e9e7b2d997da1bdd7831fe62b9ccbea33176d5847b3a232e
+size 4972363336
model-00007-of-00009.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:200657a21c776d0257b7b1e87c9ef63fad5747c97943fb7b8d8e3b8fec8d4d6f
-size 4972363288
+oid sha256:7f184d228be8d23bb21749021752120c04308a306e54660435454325567cd422
+size 4972363328
model-00008-of-00009.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:4ba35a9058353ac6344d56af61c28112934afd01f5e4183f0ce1159889da6d47
-size 4972363288
+oid sha256:9887e040dd583f8f78ff2e0e7ad238b8bb36c1f80633bb09e15d12ea37e0c2a2
+size 4972363328
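
All eight shard diffs above touch only Git LFS pointer files: three lines giving the spec version, the sha256 oid of the real payload, and its byte size. As a minimal sketch (this helper is illustrative, not part of the commit or of any LFS tooling), a downloaded shard could be checked against its new pointer like so:

import hashlib
from pathlib import Path

def verify_lfs_pointer(pointer_text: str, local_file: Path) -> bool:
    # Parse the two data lines of the pointer: "oid sha256:<hex>" and "size <bytes>".
    fields = dict(line.split(" ", 1) for line in pointer_text.strip().splitlines()[1:])
    expected_oid = fields["oid"].removeprefix("sha256:")
    expected_size = int(fields["size"])

    # Cheap check first: the byte size must match the pointer exactly.
    if local_file.stat().st_size != expected_size:
        return False

    # Stream the file (shards here are ~5 GB) and compare the sha256 digest.
    digest = hashlib.sha256()
    with local_file.open("rb") as f:
        while chunk := f.read(1 << 20):
            digest.update(chunk)
    return digest.hexdigest() == expected_oid

For model-00001, for example, the new pointer expects size 4968986592 and oid 7405ebb3….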
model.safetensors.index.json CHANGED
@@ -135,8 +135,8 @@
     "model.layers.0.mlp.experts.gate_up_projs.8.weight": "model-00001-of-00009.safetensors",
     "model.layers.0.mlp.experts.gate_up_projs.9.bias": "model-00001-of-00009.safetensors",
     "model.layers.0.mlp.experts.gate_up_projs.9.weight": "model-00001-of-00009.safetensors",
-    "model.layers.0.mlp.router.bias": "model-00001-of-00009.safetensors",
-    "model.layers.0.mlp.router.weight": "model-00001-of-00009.safetensors",
+    "model.layers.0.mlp.router.linear.bias": "model-00001-of-00009.safetensors",
+    "model.layers.0.mlp.router.linear.weight": "model-00001-of-00009.safetensors",
     "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00009.safetensors",
     "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00009.safetensors",
     "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00009.safetensors",
@@ -276,8 +276,8 @@
     "model.layers.1.mlp.experts.gate_up_projs.8.weight": "model-00001-of-00009.safetensors",
     "model.layers.1.mlp.experts.gate_up_projs.9.bias": "model-00001-of-00009.safetensors",
     "model.layers.1.mlp.experts.gate_up_projs.9.weight": "model-00001-of-00009.safetensors",
-    "model.layers.1.mlp.router.bias": "model-00001-of-00009.safetensors",
-    "model.layers.1.mlp.router.weight": "model-00001-of-00009.safetensors",
+    "model.layers.1.mlp.router.linear.bias": "model-00001-of-00009.safetensors",
+    "model.layers.1.mlp.router.linear.weight": "model-00001-of-00009.safetensors",
     "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00009.safetensors",
     "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00009.safetensors",
     "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00009.safetensors",
@@ -417,8 +417,8 @@
     "model.layers.10.mlp.experts.gate_up_projs.8.weight": "model-00004-of-00009.safetensors",
     "model.layers.10.mlp.experts.gate_up_projs.9.bias": "model-00004-of-00009.safetensors",
     "model.layers.10.mlp.experts.gate_up_projs.9.weight": "model-00004-of-00009.safetensors",
-    "model.layers.10.mlp.router.bias": "model-00004-of-00009.safetensors",
-    "model.layers.10.mlp.router.weight": "model-00004-of-00009.safetensors",
+    "model.layers.10.mlp.router.linear.bias": "model-00004-of-00009.safetensors",
+    "model.layers.10.mlp.router.linear.weight": "model-00004-of-00009.safetensors",
     "model.layers.10.post_attention_layernorm.weight": "model-00004-of-00009.safetensors",
     "model.layers.10.self_attn.k_proj.bias": "model-00004-of-00009.safetensors",
     "model.layers.10.self_attn.k_proj.weight": "model-00004-of-00009.safetensors",
@@ -558,8 +558,8 @@
     "model.layers.11.mlp.experts.gate_up_projs.8.weight": "model-00004-of-00009.safetensors",
     "model.layers.11.mlp.experts.gate_up_projs.9.bias": "model-00004-of-00009.safetensors",
     "model.layers.11.mlp.experts.gate_up_projs.9.weight": "model-00004-of-00009.safetensors",
-    "model.layers.11.mlp.router.bias": "model-00004-of-00009.safetensors",
-    "model.layers.11.mlp.router.weight": "model-00004-of-00009.safetensors",
+    "model.layers.11.mlp.router.linear.bias": "model-00004-of-00009.safetensors",
+    "model.layers.11.mlp.router.linear.weight": "model-00004-of-00009.safetensors",
     "model.layers.11.post_attention_layernorm.weight": "model-00005-of-00009.safetensors",
     "model.layers.11.self_attn.k_proj.bias": "model-00004-of-00009.safetensors",
     "model.layers.11.self_attn.k_proj.weight": "model-00004-of-00009.safetensors",
@@ -699,8 +699,8 @@
     "model.layers.12.mlp.experts.gate_up_projs.8.weight": "model-00005-of-00009.safetensors",
     "model.layers.12.mlp.experts.gate_up_projs.9.bias": "model-00005-of-00009.safetensors",
     "model.layers.12.mlp.experts.gate_up_projs.9.weight": "model-00005-of-00009.safetensors",
-    "model.layers.12.mlp.router.bias": "model-00005-of-00009.safetensors",
-    "model.layers.12.mlp.router.weight": "model-00005-of-00009.safetensors",
+    "model.layers.12.mlp.router.linear.bias": "model-00005-of-00009.safetensors",
+    "model.layers.12.mlp.router.linear.weight": "model-00005-of-00009.safetensors",
     "model.layers.12.post_attention_layernorm.weight": "model-00005-of-00009.safetensors",
     "model.layers.12.self_attn.k_proj.bias": "model-00005-of-00009.safetensors",
     "model.layers.12.self_attn.k_proj.weight": "model-00005-of-00009.safetensors",
@@ -840,8 +840,8 @@
     "model.layers.13.mlp.experts.gate_up_projs.8.weight": "model-00005-of-00009.safetensors",
     "model.layers.13.mlp.experts.gate_up_projs.9.bias": "model-00005-of-00009.safetensors",
     "model.layers.13.mlp.experts.gate_up_projs.9.weight": "model-00005-of-00009.safetensors",
-    "model.layers.13.mlp.router.bias": "model-00005-of-00009.safetensors",
-    "model.layers.13.mlp.router.weight": "model-00005-of-00009.safetensors",
+    "model.layers.13.mlp.router.linear.bias": "model-00005-of-00009.safetensors",
+    "model.layers.13.mlp.router.linear.weight": "model-00005-of-00009.safetensors",
     "model.layers.13.post_attention_layernorm.weight": "model-00005-of-00009.safetensors",
     "model.layers.13.self_attn.k_proj.bias": "model-00005-of-00009.safetensors",
     "model.layers.13.self_attn.k_proj.weight": "model-00005-of-00009.safetensors",
@@ -981,8 +981,8 @@
     "model.layers.14.mlp.experts.gate_up_projs.8.weight": "model-00005-of-00009.safetensors",
     "model.layers.14.mlp.experts.gate_up_projs.9.bias": "model-00005-of-00009.safetensors",
     "model.layers.14.mlp.experts.gate_up_projs.9.weight": "model-00005-of-00009.safetensors",
-    "model.layers.14.mlp.router.bias": "model-00005-of-00009.safetensors",
-    "model.layers.14.mlp.router.weight": "model-00005-of-00009.safetensors",
+    "model.layers.14.mlp.router.linear.bias": "model-00005-of-00009.safetensors",
+    "model.layers.14.mlp.router.linear.weight": "model-00005-of-00009.safetensors",
     "model.layers.14.post_attention_layernorm.weight": "model-00006-of-00009.safetensors",
     "model.layers.14.self_attn.k_proj.bias": "model-00005-of-00009.safetensors",
     "model.layers.14.self_attn.k_proj.weight": "model-00005-of-00009.safetensors",
@@ -1122,8 +1122,8 @@
     "model.layers.15.mlp.experts.gate_up_projs.8.weight": "model-00006-of-00009.safetensors",
     "model.layers.15.mlp.experts.gate_up_projs.9.bias": "model-00006-of-00009.safetensors",
     "model.layers.15.mlp.experts.gate_up_projs.9.weight": "model-00006-of-00009.safetensors",
-    "model.layers.15.mlp.router.bias": "model-00006-of-00009.safetensors",
-    "model.layers.15.mlp.router.weight": "model-00006-of-00009.safetensors",
+    "model.layers.15.mlp.router.linear.bias": "model-00006-of-00009.safetensors",
+    "model.layers.15.mlp.router.linear.weight": "model-00006-of-00009.safetensors",
     "model.layers.15.post_attention_layernorm.weight": "model-00006-of-00009.safetensors",
     "model.layers.15.self_attn.k_proj.bias": "model-00006-of-00009.safetensors",
     "model.layers.15.self_attn.k_proj.weight": "model-00006-of-00009.safetensors",
@@ -1263,8 +1263,8 @@
     "model.layers.16.mlp.experts.gate_up_projs.8.weight": "model-00006-of-00009.safetensors",
     "model.layers.16.mlp.experts.gate_up_projs.9.bias": "model-00006-of-00009.safetensors",
     "model.layers.16.mlp.experts.gate_up_projs.9.weight": "model-00006-of-00009.safetensors",
-    "model.layers.16.mlp.router.bias": "model-00006-of-00009.safetensors",
-    "model.layers.16.mlp.router.weight": "model-00006-of-00009.safetensors",
+    "model.layers.16.mlp.router.linear.bias": "model-00006-of-00009.safetensors",
+    "model.layers.16.mlp.router.linear.weight": "model-00006-of-00009.safetensors",
     "model.layers.16.post_attention_layernorm.weight": "model-00006-of-00009.safetensors",
     "model.layers.16.self_attn.k_proj.bias": "model-00006-of-00009.safetensors",
     "model.layers.16.self_attn.k_proj.weight": "model-00006-of-00009.safetensors",
@@ -1404,8 +1404,8 @@
     "model.layers.17.mlp.experts.gate_up_projs.8.weight": "model-00006-of-00009.safetensors",
     "model.layers.17.mlp.experts.gate_up_projs.9.bias": "model-00006-of-00009.safetensors",
     "model.layers.17.mlp.experts.gate_up_projs.9.weight": "model-00006-of-00009.safetensors",
-    "model.layers.17.mlp.router.bias": "model-00006-of-00009.safetensors",
-    "model.layers.17.mlp.router.weight": "model-00006-of-00009.safetensors",
+    "model.layers.17.mlp.router.linear.bias": "model-00006-of-00009.safetensors",
+    "model.layers.17.mlp.router.linear.weight": "model-00006-of-00009.safetensors",
     "model.layers.17.post_attention_layernorm.weight": "model-00007-of-00009.safetensors",
     "model.layers.17.self_attn.k_proj.bias": "model-00006-of-00009.safetensors",
     "model.layers.17.self_attn.k_proj.weight": "model-00006-of-00009.safetensors",
@@ -1545,8 +1545,8 @@
     "model.layers.18.mlp.experts.gate_up_projs.8.weight": "model-00007-of-00009.safetensors",
     "model.layers.18.mlp.experts.gate_up_projs.9.bias": "model-00007-of-00009.safetensors",
     "model.layers.18.mlp.experts.gate_up_projs.9.weight": "model-00007-of-00009.safetensors",
-    "model.layers.18.mlp.router.bias": "model-00007-of-00009.safetensors",
-    "model.layers.18.mlp.router.weight": "model-00007-of-00009.safetensors",
+    "model.layers.18.mlp.router.linear.bias": "model-00007-of-00009.safetensors",
+    "model.layers.18.mlp.router.linear.weight": "model-00007-of-00009.safetensors",
     "model.layers.18.post_attention_layernorm.weight": "model-00007-of-00009.safetensors",
     "model.layers.18.self_attn.k_proj.bias": "model-00007-of-00009.safetensors",
     "model.layers.18.self_attn.k_proj.weight": "model-00007-of-00009.safetensors",
@@ -1686,8 +1686,8 @@
     "model.layers.19.mlp.experts.gate_up_projs.8.weight": "model-00007-of-00009.safetensors",
     "model.layers.19.mlp.experts.gate_up_projs.9.bias": "model-00007-of-00009.safetensors",
     "model.layers.19.mlp.experts.gate_up_projs.9.weight": "model-00007-of-00009.safetensors",
-    "model.layers.19.mlp.router.bias": "model-00007-of-00009.safetensors",
-    "model.layers.19.mlp.router.weight": "model-00007-of-00009.safetensors",
+    "model.layers.19.mlp.router.linear.bias": "model-00007-of-00009.safetensors",
+    "model.layers.19.mlp.router.linear.weight": "model-00007-of-00009.safetensors",
     "model.layers.19.post_attention_layernorm.weight": "model-00007-of-00009.safetensors",
     "model.layers.19.self_attn.k_proj.bias": "model-00007-of-00009.safetensors",
     "model.layers.19.self_attn.k_proj.weight": "model-00007-of-00009.safetensors",
@@ -1827,8 +1827,8 @@
     "model.layers.2.mlp.experts.gate_up_projs.8.weight": "model-00001-of-00009.safetensors",
     "model.layers.2.mlp.experts.gate_up_projs.9.bias": "model-00001-of-00009.safetensors",
     "model.layers.2.mlp.experts.gate_up_projs.9.weight": "model-00001-of-00009.safetensors",
-    "model.layers.2.mlp.router.bias": "model-00001-of-00009.safetensors",
-    "model.layers.2.mlp.router.weight": "model-00001-of-00009.safetensors",
+    "model.layers.2.mlp.router.linear.bias": "model-00001-of-00009.safetensors",
+    "model.layers.2.mlp.router.linear.weight": "model-00001-of-00009.safetensors",
     "model.layers.2.post_attention_layernorm.weight": "model-00002-of-00009.safetensors",
     "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00009.safetensors",
     "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00009.safetensors",
@@ -1968,8 +1968,8 @@
     "model.layers.20.mlp.experts.gate_up_projs.8.weight": "model-00007-of-00009.safetensors",
     "model.layers.20.mlp.experts.gate_up_projs.9.bias": "model-00007-of-00009.safetensors",
     "model.layers.20.mlp.experts.gate_up_projs.9.weight": "model-00007-of-00009.safetensors",
-    "model.layers.20.mlp.router.bias": "model-00007-of-00009.safetensors",
-    "model.layers.20.mlp.router.weight": "model-00007-of-00009.safetensors",
+    "model.layers.20.mlp.router.linear.bias": "model-00007-of-00009.safetensors",
+    "model.layers.20.mlp.router.linear.weight": "model-00007-of-00009.safetensors",
     "model.layers.20.post_attention_layernorm.weight": "model-00008-of-00009.safetensors",
     "model.layers.20.self_attn.k_proj.bias": "model-00007-of-00009.safetensors",
     "model.layers.20.self_attn.k_proj.weight": "model-00007-of-00009.safetensors",
@@ -2109,8 +2109,8 @@
     "model.layers.21.mlp.experts.gate_up_projs.8.weight": "model-00008-of-00009.safetensors",
     "model.layers.21.mlp.experts.gate_up_projs.9.bias": "model-00008-of-00009.safetensors",
     "model.layers.21.mlp.experts.gate_up_projs.9.weight": "model-00008-of-00009.safetensors",
-    "model.layers.21.mlp.router.bias": "model-00008-of-00009.safetensors",
-    "model.layers.21.mlp.router.weight": "model-00008-of-00009.safetensors",
+    "model.layers.21.mlp.router.linear.bias": "model-00008-of-00009.safetensors",
+    "model.layers.21.mlp.router.linear.weight": "model-00008-of-00009.safetensors",
     "model.layers.21.post_attention_layernorm.weight": "model-00008-of-00009.safetensors",
     "model.layers.21.self_attn.k_proj.bias": "model-00008-of-00009.safetensors",
     "model.layers.21.self_attn.k_proj.weight": "model-00008-of-00009.safetensors",
@@ -2250,8 +2250,8 @@
     "model.layers.22.mlp.experts.gate_up_projs.8.weight": "model-00008-of-00009.safetensors",
     "model.layers.22.mlp.experts.gate_up_projs.9.bias": "model-00008-of-00009.safetensors",
     "model.layers.22.mlp.experts.gate_up_projs.9.weight": "model-00008-of-00009.safetensors",
-    "model.layers.22.mlp.router.bias": "model-00008-of-00009.safetensors",
-    "model.layers.22.mlp.router.weight": "model-00008-of-00009.safetensors",
+    "model.layers.22.mlp.router.linear.bias": "model-00008-of-00009.safetensors",
+    "model.layers.22.mlp.router.linear.weight": "model-00008-of-00009.safetensors",
     "model.layers.22.post_attention_layernorm.weight": "model-00008-of-00009.safetensors",
     "model.layers.22.self_attn.k_proj.bias": "model-00008-of-00009.safetensors",
     "model.layers.22.self_attn.k_proj.weight": "model-00008-of-00009.safetensors",
@@ -2391,8 +2391,8 @@
     "model.layers.23.mlp.experts.gate_up_projs.8.weight": "model-00008-of-00009.safetensors",
     "model.layers.23.mlp.experts.gate_up_projs.9.bias": "model-00008-of-00009.safetensors",
     "model.layers.23.mlp.experts.gate_up_projs.9.weight": "model-00008-of-00009.safetensors",
-    "model.layers.23.mlp.router.bias": "model-00008-of-00009.safetensors",
-    "model.layers.23.mlp.router.weight": "model-00008-of-00009.safetensors",
+    "model.layers.23.mlp.router.linear.bias": "model-00008-of-00009.safetensors",
+    "model.layers.23.mlp.router.linear.weight": "model-00008-of-00009.safetensors",
     "model.layers.23.post_attention_layernorm.weight": "model-00009-of-00009.safetensors",
     "model.layers.23.self_attn.k_proj.bias": "model-00008-of-00009.safetensors",
     "model.layers.23.self_attn.k_proj.weight": "model-00008-of-00009.safetensors",
@@ -2532,8 +2532,8 @@
     "model.layers.3.mlp.experts.gate_up_projs.8.weight": "model-00002-of-00009.safetensors",
     "model.layers.3.mlp.experts.gate_up_projs.9.bias": "model-00002-of-00009.safetensors",
     "model.layers.3.mlp.experts.gate_up_projs.9.weight": "model-00002-of-00009.safetensors",
-    "model.layers.3.mlp.router.bias": "model-00002-of-00009.safetensors",
-    "model.layers.3.mlp.router.weight": "model-00002-of-00009.safetensors",
+    "model.layers.3.mlp.router.linear.bias": "model-00002-of-00009.safetensors",
+    "model.layers.3.mlp.router.linear.weight": "model-00002-of-00009.safetensors",
     "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00009.safetensors",
     "model.layers.3.self_attn.k_proj.bias": "model-00002-of-00009.safetensors",
     "model.layers.3.self_attn.k_proj.weight": "model-00002-of-00009.safetensors",
@@ -2673,8 +2673,8 @@
     "model.layers.4.mlp.experts.gate_up_projs.8.weight": "model-00002-of-00009.safetensors",
     "model.layers.4.mlp.experts.gate_up_projs.9.bias": "model-00002-of-00009.safetensors",
     "model.layers.4.mlp.experts.gate_up_projs.9.weight": "model-00002-of-00009.safetensors",
-    "model.layers.4.mlp.router.bias": "model-00002-of-00009.safetensors",
-    "model.layers.4.mlp.router.weight": "model-00002-of-00009.safetensors",
+    "model.layers.4.mlp.router.linear.bias": "model-00002-of-00009.safetensors",
+    "model.layers.4.mlp.router.linear.weight": "model-00002-of-00009.safetensors",
     "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00009.safetensors",
     "model.layers.4.self_attn.k_proj.bias": "model-00002-of-00009.safetensors",
     "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00009.safetensors",
@@ -2814,8 +2814,8 @@
     "model.layers.5.mlp.experts.gate_up_projs.8.weight": "model-00002-of-00009.safetensors",
     "model.layers.5.mlp.experts.gate_up_projs.9.bias": "model-00002-of-00009.safetensors",
     "model.layers.5.mlp.experts.gate_up_projs.9.weight": "model-00002-of-00009.safetensors",
-    "model.layers.5.mlp.router.bias": "model-00002-of-00009.safetensors",
-    "model.layers.5.mlp.router.weight": "model-00002-of-00009.safetensors",
+    "model.layers.5.mlp.router.linear.bias": "model-00002-of-00009.safetensors",
+    "model.layers.5.mlp.router.linear.weight": "model-00002-of-00009.safetensors",
     "model.layers.5.post_attention_layernorm.weight": "model-00003-of-00009.safetensors",
     "model.layers.5.self_attn.k_proj.bias": "model-00002-of-00009.safetensors",
     "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00009.safetensors",
@@ -2955,8 +2955,8 @@
     "model.layers.6.mlp.experts.gate_up_projs.8.weight": "model-00003-of-00009.safetensors",
     "model.layers.6.mlp.experts.gate_up_projs.9.bias": "model-00003-of-00009.safetensors",
     "model.layers.6.mlp.experts.gate_up_projs.9.weight": "model-00003-of-00009.safetensors",
-    "model.layers.6.mlp.router.bias": "model-00003-of-00009.safetensors",
-    "model.layers.6.mlp.router.weight": "model-00003-of-00009.safetensors",
+    "model.layers.6.mlp.router.linear.bias": "model-00003-of-00009.safetensors",
+    "model.layers.6.mlp.router.linear.weight": "model-00003-of-00009.safetensors",
     "model.layers.6.post_attention_layernorm.weight": "model-00003-of-00009.safetensors",
     "model.layers.6.self_attn.k_proj.bias": "model-00003-of-00009.safetensors",
     "model.layers.6.self_attn.k_proj.weight": "model-00003-of-00009.safetensors",
@@ -3096,8 +3096,8 @@
     "model.layers.7.mlp.experts.gate_up_projs.8.weight": "model-00003-of-00009.safetensors",
     "model.layers.7.mlp.experts.gate_up_projs.9.bias": "model-00003-of-00009.safetensors",
     "model.layers.7.mlp.experts.gate_up_projs.9.weight": "model-00003-of-00009.safetensors",
-    "model.layers.7.mlp.router.bias": "model-00003-of-00009.safetensors",
-    "model.layers.7.mlp.router.weight": "model-00003-of-00009.safetensors",
+    "model.layers.7.mlp.router.linear.bias": "model-00003-of-00009.safetensors",
+    "model.layers.7.mlp.router.linear.weight": "model-00003-of-00009.safetensors",
     "model.layers.7.post_attention_layernorm.weight": "model-00003-of-00009.safetensors",
     "model.layers.7.self_attn.k_proj.bias": "model-00003-of-00009.safetensors",
     "model.layers.7.self_attn.k_proj.weight": "model-00003-of-00009.safetensors",
@@ -3237,8 +3237,8 @@
     "model.layers.8.mlp.experts.gate_up_projs.8.weight": "model-00003-of-00009.safetensors",
     "model.layers.8.mlp.experts.gate_up_projs.9.bias": "model-00003-of-00009.safetensors",
     "model.layers.8.mlp.experts.gate_up_projs.9.weight": "model-00003-of-00009.safetensors",
-    "model.layers.8.mlp.router.bias": "model-00003-of-00009.safetensors",
-    "model.layers.8.mlp.router.weight": "model-00003-of-00009.safetensors",
+    "model.layers.8.mlp.router.linear.bias": "model-00003-of-00009.safetensors",
+    "model.layers.8.mlp.router.linear.weight": "model-00003-of-00009.safetensors",
     "model.layers.8.post_attention_layernorm.weight": "model-00004-of-00009.safetensors",
     "model.layers.8.self_attn.k_proj.bias": "model-00003-of-00009.safetensors",
     "model.layers.8.self_attn.k_proj.weight": "model-00003-of-00009.safetensors",
@@ -3378,8 +3378,8 @@
     "model.layers.9.mlp.experts.gate_up_projs.8.weight": "model-00004-of-00009.safetensors",
     "model.layers.9.mlp.experts.gate_up_projs.9.bias": "model-00004-of-00009.safetensors",
     "model.layers.9.mlp.experts.gate_up_projs.9.weight": "model-00004-of-00009.safetensors",
-    "model.layers.9.mlp.router.bias": "model-00004-of-00009.safetensors",
-    "model.layers.9.mlp.router.weight": "model-00004-of-00009.safetensors",
+    "model.layers.9.mlp.router.linear.bias": "model-00004-of-00009.safetensors",
+    "model.layers.9.mlp.router.linear.weight": "model-00004-of-00009.safetensors",
     "model.layers.9.post_attention_layernorm.weight": "model-00004-of-00009.safetensors",
     "model.layers.9.self_attn.k_proj.bias": "model-00004-of-00009.safetensors",
     "model.layers.9.self_attn.k_proj.weight": "model-00004-of-00009.safetensors",