Upload GPTRefactForCausalLM
Files changed:
- config.json +1 -1
- modeling_gpt_refact.py +9 -13
- pytorch_model.bin +2 -2
config.json CHANGED

@@ -20,7 +20,7 @@
   "n_layer": 32,
   "n_positions": 4096,
   "scale_attention_softmax_in_fp32": true,
-  "torch_dtype": "
+  "torch_dtype": "bfloat16",
   "transformers_version": "4.31.0",
   "use_cache": true,
   "vocab_size": 49216
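The only substantive change in config.json is the declared checkpoint dtype, which is now "bfloat16". As a minimal sketch of loading the model in that dtype (the repository id below is illustrative, and trust_remote_code is assumed to be needed for the custom GPTRefactForCausalLM class):

```python
# Sketch: load the checkpoint in the dtype declared by config.json.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

repo_id = "smallcloudai/Refact-1_6B-fim"  # assumption: substitute the actual repo id
tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    repo_id,
    torch_dtype=torch.bfloat16,  # matches the new "torch_dtype" entry
    trust_remote_code=True,      # loads the custom modeling_gpt_refact.py
)
```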
modeling_gpt_refact.py CHANGED

@@ -101,7 +101,6 @@ def get_alibi_biases(
     # Multiply them pair-wise to get the AliBi bias matrix
     biases = distance[:, :, None] * m[None, None, :]
     biases = biases.permute(2, 0, 1)[None, :, :T, :T]
-    biases = biases.repeat(B, 1, 1, 1)
     return biases.contiguous()


@@ -132,8 +131,7 @@ class Attention(nn.Module):
         self.attention_bias_in_fp32 = config.attention_bias_in_fp32

         self.q = nn.Linear(self.embed_dim, self.embed_dim, bias=False)
-        self.k = nn.Linear(self.embed_dim, self.head_dim, bias=False)
-        self.v = nn.Linear(self.embed_dim, self.head_dim, bias=False)
+        self.kv = nn.Linear(self.embed_dim, self.head_dim * 2, bias=False)
         self.c_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=False)

     def _get_mask_value(self, device, dtype):

@@ -200,8 +198,8 @@ class Attention(nn.Module):
         Tuple[torch.Tensor, Optional[torch.Tensor], Tuple[torch.Tensor, ...]],
     ]:
         query = self.q(hidden_states)
-        key = self.k(hidden_states)
-        value = self.v(hidden_states)
+        kv = self.kv(hidden_states)
+        key, value = kv.split(self.head_dim, dim=-1)

         if layer_past is not None:
             past_key, past_value = layer_past

@@ -231,15 +229,14 @@ class MLP(nn.Module):
         embed_dim = config.hidden_size
         hidden_dim = intermediate_size
         hidden_dim = int(2 * hidden_dim / 3)
-        hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)
-        self.
-        self.
-        self.c_proj = nn.Linear(hidden_dim, embed_dim, bias=False)
+        self.hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)
+        self.gate_up_proj = nn.Linear(embed_dim, self.hidden_dim * 2, bias=False)
+        self.c_proj = nn.Linear(self.hidden_dim, embed_dim, bias=False)

     def forward(self, x: torch.Tensor) -> torch.Tensor:
-
-        x2 = self.
-        x = self.c_proj(x1 * x2)
+        up_proj = self.gate_up_proj(x)
+        x1, x2 = torch.split(up_proj, self.hidden_dim, dim=-1)
+        x = self.c_proj(F.silu(x1) * x2)
         return x


@@ -264,7 +261,6 @@ class GPTRefactBlock(nn.Module):
         self.ln_1 = LayerNormNoBias(hidden_size, eps=config.layer_norm_epsilon)
         self.attn = Attention(config, layer_idx=layer_idx)
         self.ln_2 = LayerNormNoBias(hidden_size, eps=config.layer_norm_epsilon)
-
         self.mlp = MLP(self.inner_dim, config)

     def forward(
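Taken together, the modeling changes do three things: the ALiBi bias is no longer repeated across the batch dimension (broadcasting handles it), the separate k and v projections are fused into a single kv linear that is split back into key and value, and the MLP's two input projections are fused into one gate_up_proj followed by SiLU gating. A self-contained sketch of the two fused-projection patterns, using illustrative module names rather than the repo's classes:

```python
# Sketch of the fused-projection patterns introduced in this commit
# (illustrative classes, not the actual GPTRefact modules).
import torch
import torch.nn as nn
import torch.nn.functional as F


class FusedKV(nn.Module):
    """One linear producing key and value together (a single head_dim-sized
    key/value pair, as in multi-query attention)."""

    def __init__(self, embed_dim: int, head_dim: int):
        super().__init__()
        self.head_dim = head_dim
        self.kv = nn.Linear(embed_dim, head_dim * 2, bias=False)

    def forward(self, x: torch.Tensor):
        key, value = self.kv(x).split(self.head_dim, dim=-1)
        return key, value


class FusedSwiGLU(nn.Module):
    """One linear producing gate and up projections together, then SiLU gating."""

    def __init__(self, embed_dim: int, hidden_dim: int):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.gate_up_proj = nn.Linear(embed_dim, hidden_dim * 2, bias=False)
        self.c_proj = nn.Linear(hidden_dim, embed_dim, bias=False)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x1, x2 = torch.split(self.gate_up_proj(x), self.hidden_dim, dim=-1)
        return self.c_proj(F.silu(x1) * x2)
```

Note that fusing the projections renames and reshapes entries in the state dict, which is consistent with the weight file being re-uploaded in the same commit.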
pytorch_model.bin CHANGED

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:6bf4dc20907069119671fdaf9f7b79d0260cd36ab94626f4af4fdd5a157d0205
+size 3171755929
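The re-uploaded weights appear here only as their Git LFS pointer (a sha256 digest and a byte count); at roughly 3.17 GB the size is consistent with a ~1.6B-parameter model stored in bfloat16 (about two bytes per parameter). A small sketch for checking a downloaded copy against the pointer, with an illustrative local path:

```python
# Sketch: verify a local pytorch_model.bin against the LFS pointer above.
import hashlib
from pathlib import Path

path = Path("pytorch_model.bin")  # illustrative local path
expected_sha256 = "6bf4dc20907069119671fdaf9f7b79d0260cd36ab94626f4af4fdd5a157d0205"
expected_size = 3_171_755_929

assert path.stat().st_size == expected_size, "size mismatch"

digest = hashlib.sha256()
with path.open("rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):  # hash in 1 MiB chunks
        digest.update(chunk)
assert digest.hexdigest() == expected_sha256, "sha256 mismatch"
print("pytorch_model.bin matches the LFS pointer")
```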