Upload DogeForCausalLM
- configuration_doge.py (+1, -1)
- modeling_doge.py (+5, -4)
configuration_doge.py
CHANGED
@@ -3,7 +3,7 @@
 #
 # This code is based on the Wonderful Matrices paper implementation.
 #
-# https://arxiv.org/abs/
+# https://arxiv.org/abs/2412.11834
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
modeling_doge.py
CHANGED
@@ -3,7 +3,7 @@
 #
 # This code is based on the Wonderful Matrices paper implementation.
 #
-# https://arxiv.org/abs/
+# https://arxiv.org/abs/2412.11834
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -184,6 +184,7 @@ def apply_QK_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
 
 
 class DogeDynamicMaskAttention(nn.Module):
+    """Dynamic Mask Attention from 'Wonderful Matrices' paper."""
 
     def __init__(self, config: DogeConfig, layer_idx: Optional[int] = None):
         super().__init__()
@@ -387,6 +388,7 @@ class DogeMLP(nn.Module):
 
 
 class DogeCDMoE(DogeMLP):
+    """Cross Domain Mixture of Experts from 'Wonderful Matrices' paper."""
 
     def __init__(self, config: DogeConfig):
         super().__init__(config)
@@ -816,7 +818,7 @@ class DogeModel(DogePreTrainedModel):
         )
 
         # in case the provided `attention` mask is 2D, we generate a causal mask here (4D).
-        causal_mask = self.
+        causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
             attention_mask=attention_mask,
             sequence_length=sequence_length,
             target_length=target_length,
@@ -829,7 +831,7 @@ class DogeModel(DogePreTrainedModel):
         return causal_mask
 
     @staticmethod
-    def 
+    def _prepare_4d_causal_attention_mask_with_cache_position(
         attention_mask: torch.Tensor = None,
         sequence_length: int = None,
         target_length: int = None,
@@ -875,7 +877,6 @@ class DogeModel(DogePreTrainedModel):
         causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
         causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
         if attention_mask is not None:
-            # print(f"attention_mask: {attention_mask.shape}")
             causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
             mask_length = attention_mask.shape[-1]
             padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :]
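For reference, below is a minimal, self-contained sketch of what the renamed helper `_prepare_4d_causal_attention_mask_with_cache_position` does: expand a 2D padding mask into the 4D additive causal mask used by attention. The mask-construction lines mirror the ones visible in the last hunk; the function signature, the `dtype`/`min_dtype` handling, and the final `masked_fill` step are assumptions for illustration, not taken from this diff.

```python
import torch


def prepare_4d_causal_mask_sketch(
    attention_mask: torch.Tensor,   # 2D padding mask, shape (batch_size, mask_length); 1 = real token, 0 = padding
    sequence_length: int,           # number of query positions in this forward pass
    target_length: int,             # key length, including cached positions
    cache_position: torch.Tensor,   # absolute positions of the current queries, shape (sequence_length,)
    dtype: torch.dtype = torch.float32,
    device: str = "cpu",
) -> torch.Tensor:
    """Sketch only: build a (batch, 1, seq, target) additive mask; 0 = attend, dtype-min = masked."""
    min_dtype = torch.finfo(dtype).min
    batch_size = attention_mask.shape[0]

    # start fully masked, then zero out (i.e. allow) key positions at or before each query position
    causal_mask = torch.full((sequence_length, target_length), min_dtype, dtype=dtype, device=device)
    causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
    causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)

    if attention_mask is not None:
        causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
        mask_length = attention_mask.shape[-1]
        # where the 2D mask marks padding (0) and the causal entry is 0, the sum is 0 -> mask it out
        padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :]
        padding_mask = padding_mask == 0
        causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
            padding_mask, min_dtype
        )
    return causal_mask


# Example: batch of 1, two new queries at positions 2 and 3, key length 4, last key padded.
mask = prepare_4d_causal_mask_sketch(
    attention_mask=torch.tensor([[1, 1, 1, 0]]),
    sequence_length=2,
    target_length=4,
    cache_position=torch.tensor([2, 3]),
)
print(mask.shape)  # torch.Size([1, 1, 2, 4])
```

Entries left at 0 are attended to; entries set to the dtype minimum vanish after softmax, which is why the debug `print` removed in the last hunk was the only behavioral no-op in this commit.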