fix-glu-mlp (#17)
fix: glu for non-flash-attn (c768124c3822ff864a2fc7477dbd2a175754fc5b)
- mlp.py +8 -2
- modeling_bert.py +2 -1
    	
mlp.py CHANGED

@@ -33,6 +33,7 @@ class GLUMLP(nn.Module):
         in_features,
         hidden_features,
         activation,
+        use_flash_attn,
         return_residual=False,
         hidden_dropout_prob=0.1
     ):
@@ -52,14 +53,19 @@ class GLUMLP(nn.Module):
         self.wo = nn.Linear(hidden_features, in_features)
         self.dropout = nn.Dropout(hidden_dropout_prob)
         self.return_residual = return_residual
+        self.use_flash_attn = use_flash_attn
         #self.layernorm = nn.LayerNorm(in_features, eps=layer_norm_eps)

     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         residual_connection = hidden_states
         # compute the activation
         hidden_states = self.gated_layers(hidden_states)
-        gated = hidden_states[:, : self.hidden_features]
-        non_gated = hidden_states[:, self.hidden_features :]
+        if self.use_flash_attn:
+            gated = hidden_states[:, : self.hidden_features]
+            non_gated = hidden_states[:, self.hidden_features :]
+        else:
+            gated = hidden_states[:, :, : self.hidden_features]
+            non_gated = hidden_states[:, :, self.hidden_features :]
         hidden_states = self.act(gated) * non_gated
         hidden_states = self.dropout(hidden_states)
         # multiply by the second matrix
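Why the branch is needed: with use_flash_attn the hidden states reach the MLP in flash-attn's unpadded layout, roughly (total_tokens, 2 * hidden_features), so the gate and value halves are split along dim 1; without flash-attn they keep the padded (batch, seq_len, 2 * hidden_features) layout, where the old 2-D slice would cut the sequence axis instead of the hidden axis. A minimal standalone sketch of the two layouts (shapes are arbitrary illustration values, not taken from the repo):

# Illustration only, not the repo's GLUMLP: shows which axis the GLU split
# must use for each input layout.
import torch

hidden_features = 4

# flash-attn path: tokens are unpadded and packed into a 2-D tensor.
packed = torch.randn(6, 2 * hidden_features)          # (total_tokens, 2*h)
gated, non_gated = packed[:, :hidden_features], packed[:, hidden_features:]
assert gated.shape == (6, hidden_features)

# non-flash-attn path: the padded 3-D layout needs the split on the last axis;
# the old 2-D slice here would have cut along seq_len instead.
padded = torch.randn(2, 3, 2 * hidden_features)       # (batch, seq_len, 2*h)
gated, non_gated = padded[:, :, :hidden_features], padded[:, :, hidden_features:]
assert gated.shape == (2, 3, hidden_features)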
    	
modeling_bert.py CHANGED

@@ -114,6 +114,7 @@ def create_mlp_cls(config, layer_idx=None, return_residual=False):
             GLUMLP,
             hidden_features=inner_dim,
             activation=config.hidden_act,
+            use_flash_attn=config.use_flash_attn,
             hidden_dropout_prob=config.hidden_dropout_prob,
             return_residual=return_residual,
         )
@@ -802,4 +803,4 @@ class BertForMaskedLM(BertPreTrainedModel):
             loss=masked_lm_loss,
             prediction_logits=prediction_scores,
             seq_relationship_logits=seq_relationship_score,
-        )
+        )
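On the modeling side, the functional change is that the MLP factory now forwards the model-level config.use_flash_attn flag into GLUMLP, so forward() can take the matching slicing branch above. A hedged sketch of that wiring with a stand-in config (the partial-application pattern and the import path are assumptions; only the keyword arguments shown in the diff are taken from the repo):

# Sketch only: mirrors the keyword arguments from the diff with a stand-in
# config; the functools.partial pattern and the "from mlp import GLUMLP"
# path are assumptions, not copied from the repo's create_mlp_cls.
from functools import partial
from types import SimpleNamespace

from mlp import GLUMLP  # assumed import path for the class patched above

config = SimpleNamespace(
    hidden_size=768,
    intermediate_size=3072,
    hidden_act="gelu",
    hidden_dropout_prob=0.1,
    use_flash_attn=False,  # padded (batch, seq_len, hidden) inputs -> 3-D slicing branch
)

mlp_cls = partial(
    GLUMLP,
    hidden_features=config.intermediate_size,
    activation=config.hidden_act,
    use_flash_attn=config.use_flash_attn,  # new keyword added by this commit
    hidden_dropout_prob=config.hidden_dropout_prob,
    return_residual=False,
)
mlp = mlp_cls(config.hidden_size)  # first positional argument is in_features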

