Birchlabs
/

mosaicml-mpt-7b-chat-qlora

@@ -316,7 +316,7 @@ class MultiheadAttention(nn.Module, Attn):
                         False, # multiquery
                     )
                 return custom_forward
-            attn_out: AttnOutput = checkpoint(
                 create_custom_forward(self.attn_fn),
                 query,
                 key,
@@ -332,7 +332,7 @@ class MultiheadAttention(nn.Module, Attn):
                 **ckpt_kwargs,
             )
         else:
-            attn_out: AttnOutput = self.attn_fn(
                 query,
                 key,
                 value,
@@ -345,7 +345,7 @@ class MultiheadAttention(nn.Module, Attn):
                 training=self.training,
                 needs_weights=needs_weights,
             )
-        context, attn_weights = attn_out
         return AttnOutput(self.out_proj(context), attn_weights, past_key_value)
 class MultiQueryAttention(nn.Module, Attn):
@@ -413,8 +413,67 @@ class MultiQueryAttention(nn.Module, Attn):
             past_key_value = PastKeyValue(key, value)
         if attn_bias is not None:
             attn_bias = attn_bias[:, :, -query.size(1):, -key.size(1):]
-        attn_fn_output: AttnFnOutput = self.attn_fn(query, key, value, self.n_heads, softmax_scale=self.softmax_scale, attn_bias=attn_bias, key_padding_mask=key_padding_mask, is_causal=is_causal, dropout_p=self.attn_dropout_p, training=self.training, needs_weights=needs_weights, multiquery=True)
-        context, attn_weights = attn_fn_output
         return AttnOutput(self.out_proj(context), attn_weights, past_key_value)
 def attn_bias_shape(attn_impl, n_heads, seq_len, alibi, prefix_lm, causal, use_sequence_id):

                         False, # multiquery
                     )
                 return custom_forward
+            attn_fn_out: AttnFnOutput = checkpoint(
                 create_custom_forward(self.attn_fn),
                 query,
                 key,
                 **ckpt_kwargs,
             )
         else:
+            attn_fn_out: AttnFnOutput = self.attn_fn(
                 query,
                 key,
                 value,
                 training=self.training,
                 needs_weights=needs_weights,
             )
+        context, attn_weights = attn_fn_out
         return AttnOutput(self.out_proj(context), attn_weights, past_key_value)
 class MultiQueryAttention(nn.Module, Attn):
             past_key_value = PastKeyValue(key, value)
         if attn_bias is not None:
             attn_bias = attn_bias[:, :, -query.size(1):, -key.size(1):]
+        if self.training and self.gradient_checkpointing:
+            ckpt_kwargs: Dict[str, Any] = {'use_reentrant': False} if is_torch_version('>=', '1.11.0') else {}
+            def create_custom_forward(attn_fn: AttnFn) -> AttnFnCheckpointed:
+                def custom_forward(
+                    query: torch.Tensor,
+                    key: torch.Tensor,
+                    value: torch.Tensor,
+                    n_heads: int,
+                    softmax_scale: Optional[float],
+                    attn_bias: Optional[torch.Tensor],
+                    key_padding_mask: Optional[torch.ByteTensor],
+                    is_causal: bool,
+                    dropout_p: float,
+                    training: bool,
+                    needs_weights: bool,
+                ):
+                    return attn_fn(
+                        query,
+                        key,
+                        value,
+                        n_heads,
+                        softmax_scale,
+                        attn_bias,
+                        key_padding_mask,
+                        is_causal,
+                        dropout_p,
+                        training,
+                        needs_weights,
+                        True, # multiquery
+                    )
+                return custom_forward
+            attn_fn_out: AttnFnOutput = checkpoint(
+                create_custom_forward(self.attn_fn),
+                query,
+                key,
+                value,
+                self.n_heads,
+                self.softmax_scale,
+                attn_bias,
+                key_padding_mask,
+                is_causal,
+                self.attn_dropout_p,
+                self.training,
+                needs_weights,
+                **ckpt_kwargs,
+            )
+        else:
+            attn_fn_out: AttnFnOutput = self.attn_fn(
+                query,
+                key,
+                value,
+                self.n_heads,
+                softmax_scale=self.softmax_scale,
+                attn_bias=attn_bias,
+                key_padding_mask=key_padding_mask,
+                is_causal=is_causal,
+                dropout_p=self.attn_dropout_p,
+                training=self.training,
+                needs_weights=needs_weights,
+            )
+        context, attn_weights = attn_fn_out
         return AttnOutput(self.out_proj(context), attn_weights, past_key_value)
 def attn_bias_shape(attn_impl, n_heads, seq_len, alibi, prefix_lm, causal, use_sequence_id):