[hotfix] update ffn dim
- configuration_grok1.py +2 -2
- modeling_grok1.py +3 -4
configuration_grok1.py
CHANGED

@@ -9,7 +9,7 @@ class Grok1Config(PretrainedConfig):
         self,
         vocab_size=32000,
         hidden_size=4096,
-        widening_factor=4.0,
+        intermediate_size=32768,
         num_hidden_layers=32,
         num_attention_heads=32,
         num_key_value_heads=32,

@@ -37,7 +37,7 @@ class Grok1Config(PretrainedConfig):
         self.embedding_multiplier_scale = embedding_multiplier_scale
         self.output_multiplier_scale = output_multiplier_scale
         self.hidden_size = hidden_size
-        self.widening_factor = widening_factor
+        self.intermediate_size = intermediate_size
         self.num_hidden_layers = num_hidden_layers
         self.num_attention_heads = num_attention_heads
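After this change the FFN width is an explicit intermediate_size field on the config (default 32768, i.e. 8 x hidden_size) rather than a value derived from a widening factor. A minimal sanity-check sketch, assuming configuration_grok1.py is importable as a local module next to the script:

    # Minimal sketch, assuming configuration_grok1.py sits alongside this file.
    from configuration_grok1 import Grok1Config

    config = Grok1Config()  # defaults as shown in the diff above
    assert config.hidden_size == 4096
    assert config.intermediate_size == 32768  # 8 * hidden_size, set explicitly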
modeling_grok1.py
CHANGED

@@ -395,11 +395,11 @@ class DecoderLayer(nn.Module):
     def __init__(
         self,
         hidden_size: int,
+        intermediate_size: int,
         num_heads: int,
         num_key_value_heads: int,
         num_experts: int,
         top_k: int,
-        widening_factor: float = 4.0,
         max_position_embeddings: int = 2048,
         attn_output_multiplier: float = 1.0,
         max_attn_val: float = 30.0,

@@ -414,8 +414,7 @@ class DecoderLayer(nn.Module):
             attn_output_multiplier=attn_output_multiplier,
             max_attn_val=max_attn_val,
         )
-
-        self.moe_block = MoeBlock(hidden_size, ffn_dim, num_experts, top_k)
+        self.moe_block = MoeBlock(hidden_size, intermediate_size, num_experts, top_k)
         self.pre_attn_norm = RMSNorm(hidden_size, eps=rms_norm_eps)
         self.post_attn_norm = RMSNorm(hidden_size, eps=rms_norm_eps)
         self.pre_moe_norm = RMSNorm(hidden_size, eps=rms_norm_eps)

@@ -543,11 +542,11 @@ class Grok1Model(Grok1PretrainedModel):
             [
                 DecoderLayer(
                     hidden_size=config.hidden_size,
+                    intermediate_size=config.intermediate_size,
                     num_heads=config.num_attention_heads,
                     num_key_value_heads=config.num_key_value_heads,
                     num_experts=config.num_experts,
                     top_k=config.num_experts_per_tok,
-                    widening_factor=config.widening_factor,
                     max_position_embeddings=config.max_position_embeddings,
                     attn_output_multiplier=config.attn_output_multiplier,
                     max_attn_val=config.max_attn_value,
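With this change DecoderLayer takes the expert width straight from config.intermediate_size and hands it to MoeBlock, instead of deriving it from a widening_factor argument. For context, here is a rough sketch of the kind of layers that dimension sizes inside each expert; this is illustrative only, does not reproduce the actual MoeBlock in modeling_grok1.py, and the gated-MLP shape is an assumption:

    import torch.nn as nn

    # Illustrative sketch only; the real MoeBlock/expert code lives in
    # modeling_grok1.py and is not part of this diff. The point is which
    # dimensions intermediate_size controls.
    class ExpertMLPSketch(nn.Module):
        def __init__(self, hidden_size: int, intermediate_size: int):
            super().__init__()
            self.w_in = nn.Linear(hidden_size, intermediate_size, bias=False)    # up-projection
            self.w_gate = nn.Linear(hidden_size, intermediate_size, bias=False)  # gate
            self.w_out = nn.Linear(intermediate_size, hidden_size, bias=False)   # down-projection

        def forward(self, x):
            # gated MLP: project up to intermediate_size, gate, project back down
            return self.w_out(nn.functional.gelu(self.w_gate(x)) * self.w_in(x))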