Update modeling_bailing_moe.py
modeling_bailing_moe.py  CHANGED  (+127 -22)
@@ -207,6 +207,90 @@ class BailingMoeDynamicNTKScalingRotaryEmbedding(BailingMoeRotaryEmbedding):
        self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)


+# Inverse dim formula to find dim based on number of rotations
+def yarn_find_correction_dim(num_rotations, dim, base=10000, max_position_embeddings=2048):
+    return (dim * math.log(max_position_embeddings / (num_rotations * 2 * math.pi))) / (2 * math.log(base))
+
+
+# Find dim range bounds based on rotations
+def yarn_find_correction_range(low_rot, high_rot, dim, base=10000, max_position_embeddings=2048):
+    low = math.floor(yarn_find_correction_dim(low_rot, dim, base, max_position_embeddings))
+    high = math.ceil(yarn_find_correction_dim(high_rot, dim, base, max_position_embeddings))
+    return max(low, 0), min(high, dim - 1)  # Clamp values just in case
+
+
+def yarn_get_mscale(scale=1, mscale=1):
+    if scale <= 1:
+        return 1.0
+    return 0.1 * mscale * math.log(scale) + 1.0
+
+
+def yarn_linear_ramp_mask(min, max, dim):
+    if min == max:
+        max += 0.001  # Prevent singularity
+
+    linear_func = (torch.arange(dim, dtype=torch.float32) - min) / (max - min)
+    ramp_func = torch.clamp(linear_func, 0, 1)
+    return ramp_func
+
+
+class BailingMoeYarnRotaryEmbedding(BailingMoeRotaryEmbedding):
+
+    def __init__(
+        self,
+        dim,
+        max_position_embeddings=2048,
+        base=10000,
+        device=None,
+        scaling_factor=1.0,
+        original_max_position_embeddings=4096,
+        beta_fast=32,
+        beta_slow=1,
+        mscale=1,
+        mscale_all_dim=0,
+    ):
+        self.scaling_factor = scaling_factor
+        self.original_max_position_embeddings = original_max_position_embeddings
+        self.beta_fast = beta_fast
+        self.beta_slow = beta_slow
+        self.mscale = mscale
+        self.mscale_all_dim = mscale_all_dim
+        super().__init__(dim, max_position_embeddings, base, device)
+
+    def _set_cos_sin_cache(self, seq_len, device, dtype):
+        self.max_seq_len_cached = seq_len
+        dim = self.dim
+
+        freq_extra = 1.0 / (self.base ** (torch.arange(0, dim, 2, dtype=torch.float32, device=device) / dim))
+        freq_inter = 1.0 / (
+            self.scaling_factor * self.base ** (torch.arange(0, dim, 2, dtype=torch.float32, device=device) / dim)
+        )
+
+        low, high = yarn_find_correction_range(
+            self.beta_fast,
+            self.beta_slow,
+            dim,
+            self.base,
+            self.original_max_position_embeddings,
+        )
+        inv_freq_mask = 1.0 - yarn_linear_ramp_mask(low, high, dim // 2).to(device=device, dtype=torch.float32)
+        inv_freq = freq_inter * (1 - inv_freq_mask) + freq_extra * inv_freq_mask
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+
+        t = torch.arange(seq_len, device=device, dtype=torch.float32)
+
+        freqs = torch.outer(t, inv_freq)
+
+        _mscale = float(
+            yarn_get_mscale(self.scaling_factor, self.mscale)
+            / yarn_get_mscale(self.scaling_factor, self.mscale_all_dim)
+        )
+
+        emb = torch.cat((freqs, freqs), dim=-1)
+        self.register_buffer("cos_cached", (emb.cos() * _mscale).to(dtype), persistent=False)
+        self.register_buffer("sin_cached", (emb.sin() * _mscale).to(dtype), persistent=False)
+
+
 # Copied from transformers.models.llama.modeling_llama.rotate_half
 def rotate_half(x):
     """Rotates half the hidden dims of the input."""
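A standalone sketch of the correction-range arithmetic used by yarn_find_correction_range above, with assumed example values (dim=128, base=10000, original_max_position_embeddings=4096, beta_fast=32, beta_slow=1); the mscale line mirrors yarn_get_mscale for an illustrative scale of 4.0:

import math

def find_correction_dim(num_rotations, dim, base=10000, max_position_embeddings=2048):
    # same formula as yarn_find_correction_dim above
    return (dim * math.log(max_position_embeddings / (num_rotations * 2 * math.pi))) / (2 * math.log(base))

dim, base, orig_max = 128, 10000, 4096                             # assumed example values
low = math.floor(find_correction_dim(32, dim, base, orig_max))     # beta_fast bound
high = math.ceil(find_correction_dim(1, dim, base, orig_max))      # beta_slow bound
low, high = max(low, 0), min(high, dim - 1)
print(low, high)   # rotary dims inside [low, high] get the linear-ramp blend of the two frequency sets
mscale = 0.1 * 1 * math.log(4.0) + 1.0   # yarn_get_mscale(scale=4.0, mscale=1), applied to the cos/sin caches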
@@ -278,7 +362,7 @@ class BailingMoeGate(nn.Module):

        init.kaiming_uniform_(self.weight, a=math.sqrt(5))

-    def forward(self, hidden_states):
+    def forward(self, hidden_states, sort=False):
        bsz, seq_len, h = hidden_states.shape
        # compute gating score
        hidden_states = hidden_states.view(-1, h)
@@ -286,7 +370,7 @@ class BailingMoeGate(nn.Module):
        scores = logits.softmax(dim=-1, dtype=torch.float32)

        # select top-k experts
-        topk_weight, topk_idx = torch.topk(scores, k=self.top_k, dim=-1, sorted=False)
+        topk_weight, topk_idx = torch.topk(scores, k=self.top_k, dim=-1, sorted=sort)

        # norm gate to sum 1
        if self.top_k > 1 and self.norm_topk_prob:
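For context, the sorted flag of torch.topk only controls whether the k selected experts come back ordered by score; the selection itself is unchanged. A toy sketch of the selection followed by the sum-to-one renormalization that the norm_topk_prob branch guards (the renormalization line is an assumption, since that branch's body sits outside this hunk):

import torch

scores = torch.softmax(torch.randn(4, 8), dim=-1)               # toy values: 4 tokens, 8 experts
topk_weight, topk_idx = torch.topk(scores, k=2, dim=-1, sorted=False)
# hypothetical renormalization so the kept weights sum to 1 per token
topk_weight = topk_weight / topk_weight.sum(dim=-1, keepdim=True)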
@@ -305,7 +389,7 @@ class BailingMoeSparseMoeBlock(nn.Module):
        super().__init__()
        self.config = config
        self.num_experts_per_tok = config.num_experts_per_tok
-        self.
+        self._setup_experts()
        self.gate = BailingMoeGate(config)
        if config.num_shared_experts is not None:
            self.shared_experts = BailingMoeMLP(
@@ -313,7 +397,7 @@ class BailingMoeSparseMoeBlock(nn.Module):
            )

    def _setup_experts(self):
-
+        self.experts = nn.ModuleList(
            [
                BailingMoeMLP(config=self.config, intermediate_size=self.config.moe_intermediate_size)
                for _ in range(self.config.num_experts)
@@ -443,6 +527,25 @@ class BailingMoeAttention(nn.Module):
                    scaling_factor=scaling_factor,
                    base=self.rope_theta,
                )
+            elif scaling_type == "yarn":
+                kwargs = {
+                    key: self.config.rope_scaling[key]
+                    for key in [
+                        "original_max_position_embeddings",
+                        "beta_fast",
+                        "beta_slow",
+                        "mscale",
+                        "mscale_all_dim",
+                    ]
+                    if key in self.config.rope_scaling
+                }
+                self.rotary_emb = BailingMoeYarnRotaryEmbedding(
+                    self.head_dim,
+                    max_position_embeddings=self.max_position_embeddings,
+                    scaling_factor=scaling_factor,
+                    base=self.rope_theta,
+                    **kwargs,
+                )
            else:
                raise ValueError(f"Unknown RoPE scaling type {scaling_type}")

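For reference, a hypothetical rope_scaling config entry that would reach this branch; the optional key names are taken from the comprehension above, the "type"/"factor" keys follow the usual transformers convention, and all values are illustrative only:

# hypothetical config fragment; only the keys listed above are forwarded to BailingMoeYarnRotaryEmbedding
rope_scaling = {
    "type": "yarn",
    "factor": 4.0,                              # becomes scaling_factor
    "original_max_position_embeddings": 4096,
    "beta_fast": 32,
    "beta_slow": 1,
    "mscale": 1,
    "mscale_all_dim": 0,
}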
@@ -1258,6 +1361,24 @@ class BailingMoeForCausalLM(BailingMoePreTrainedModel):
    def get_decoder(self):
        return self.model

+    def compute_logit(self, hidden_states):
+        if self.norm_head:
+            if self.training:
+                norm_weight = (
+                    self.lm_head.weight / (torch.norm(self.lm_head.weight, p=2, dim=0, keepdim=True) + 1e-7).detach()
+                )
+                logits = F.linear(hidden_states, norm_weight, None)
+            else:
+                self.lm_head.weight.data = (
+                    self.lm_head.weight.data.float()
+                    / (torch.norm(self.lm_head.weight.data.float(), p=2, dim=0, keepdim=True) + 1e-7)
+                ).to(hidden_states.dtype)
+                logits = F.linear(hidden_states, self.lm_head.weight.data, None)
+                self.norm_head = False
+        else:
+            logits = self.lm_head(hidden_states)
+        return logits
+
    @add_start_docstrings_to_model_forward(BAILINGMOE_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=MoeCausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
    def forward(
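As a rough illustration of the norm_head path in compute_logit, each column (hidden dim) of lm_head.weight is divided by its L2 norm taken over the vocabulary rows (dim=0) before the linear projection; a minimal sketch with assumed shapes, ignoring the .detach() and in-place weight caching used above:

import torch
import torch.nn.functional as F

hidden_states = torch.randn(2, 5, 1024)   # assumed (batch, seq, hidden)
weight = torch.randn(32000, 1024)         # assumed lm_head weight shape (vocab, hidden)

# column-wise L2 normalization over the vocabulary axis (dim=0)
norm_weight = weight / (torch.norm(weight, p=2, dim=0, keepdim=True) + 1e-7)
logits = F.linear(hidden_states, norm_weight)   # shape (2, 5, 32000)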
@@ -1325,22 +1446,7 @@ class BailingMoeForCausalLM(BailingMoePreTrainedModel):

        hidden_states = outputs[0]

-        if self.norm_head:
-            if self.training:
-                norm_weight = (
-                    self.lm_head.weight / (torch.norm(self.lm_head.weight, p=2, dim=0, keepdim=True) + 1e-7).detach()
-                )
-                logits = F.linear(hidden_states, norm_weight, None)
-            else:
-                self.lm_head.weight.data = (
-                    self.lm_head.weight.data.float()
-                    / (torch.norm(self.lm_head.weight.data.float(), p=2, dim=0, keepdim=True) + 1e-7)
-                ).to(hidden_states.dtype)
-                logits = F.linear(hidden_states, self.lm_head.weight.data, None)
-                self.norm_head = False
-        else:
-            logits = self.lm_head(hidden_states)
-
+        logits = self.compute_logit(hidden_states=hidden_states)
        logits = logits.float()

        loss = None
@@ -1392,8 +1498,7 @@ class BailingMoeForCausalLM(BailingMoePreTrainedModel):

            # Keep only the unprocessed tokens:
            # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
-            # some of the inputs are exclusivelly passed as part of the cache (e.g. when passing input_embeds as
-            # input)
+            # some of the inputs are exclusivelly passed as part of the cache (e.g. when passing input_embeds as input)
            if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
                input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :]
            # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard
|