Update raven_modeling_minimal.py
raven_modeling_minimal.py  (+14 -13)  CHANGED
@@ -23,6 +23,16 @@ import torch.nn.functional as F
 from transformers import GenerationConfig
 
 
+@cache
+def _init_func(dim, num_layers) -> dict[str, float]:
+    return {
+        "std": math.sqrt(2 / (5 * dim)),
+        "out_proj": math.sqrt(2 / (5 * dim)) / math.sqrt(2 * num_layers),
+        "embedding": math.sqrt(2 / (5 * dim)),
+        "embed_scale": math.sqrt(dim),
+    }
+
+
 class RavenPreTrainedModel(PreTrainedModel):
     config_class = RavenConfig
     base_model_prefix = "model"
@@ -37,18 +47,9 @@ class RavenPreTrainedModel(PreTrainedModel):
     _supports_static_cache = True
     _tp_plan = {}
 
-    @cache
-    def _init_func(self, dim, num_layers):
-        return {
-            "std": math.sqrt(2 / (5 * dim)),
-            "out_proj": math.sqrt(2 / (5 * dim)) / math.sqrt(2 * num_layers),
-            "embedding": math.sqrt(2 / (5 * dim)),
-            "embed_scale": math.sqrt(dim),
-        }
-
     @property
     def emb_scale(self):
-        return
+        return _init_func(self.config.n_embd, self.config.effective_expected_depth)["embed_scale"]
 
     def _normal_(self, tensor, std):
         return torch.nn.init.trunc_normal_(tensor, mean=0.0, std=std, a=-3 * std, b=3 * std)
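Aside, not part of the commit: functools.cache on an instance method includes self in the cache key, so every model instance gets its own cached entries and stays referenced by the cache, while the module-level function above is keyed only on (dim, num_layers). A minimal sketch with hypothetical names:

from functools import cache

class ModelSketch:
    # As a method, the cache key is (self, dim, num_layers): one entry per instance,
    # and the cache keeps a reference to every instance it has seen.
    @cache
    def _init_func_method(self, dim, num_layers):
        return {"std": (2 / (5 * dim)) ** 0.5}

# At module level, the cache key is just (dim, num_layers): one entry shared by all callers.
@cache
def _init_func_sketch(dim, num_layers):
    return {"std": (2 / (5 * dim)) ** 0.5}

a, b = ModelSketch(), ModelSketch()
a._init_func_method(4096, 32)   # new cache entry keyed on `a`
b._init_func_method(4096, 32)   # a second entry keyed on `b`
assert _init_func_sketch(4096, 32) is _init_func_sketch(4096, 32)  # single shared entry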
@@ -86,7 +87,7 @@ class RavenPreTrainedModel(PreTrainedModel):
 
     @torch.no_grad()
     def _init_weights(self, module):
-        _init_values =
+        _init_values = _init_func(self.config.n_embd, self.config.effective_expected_depth)
         name = self._full_name_of_module_lookup[id(module)]
         if isinstance(module, RMSNorm):
             torch.nn.init.ones_(module.weight)
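For a rough sense of the numbers that _init_weights reads from this table, here is the same arithmetic with illustrative sizes (not this model's actual n_embd or effective_expected_depth):

import math

dim, num_layers = 4096, 32   # illustrative only

init = {
    "std": math.sqrt(2 / (5 * dim)),                                   # ~0.00988
    "out_proj": math.sqrt(2 / (5 * dim)) / math.sqrt(2 * num_layers),  # ~0.00988 / 8 ~ 0.00124
    "embedding": math.sqrt(2 / (5 * dim)),                             # ~0.00988
    "embed_scale": math.sqrt(dim),                                     # = 64.0
}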
@@ -703,14 +704,14 @@ class RavenForCausalLM(RavenPreTrainedModel, GenerationMixin):
             loss = torch.nn.functional.cross_entropy(
                 logits.view(-1, logits.shape[-1]), labels.view(-1), ignore_index=-100
             )
-            log_ppl = loss.clone().detach()
+            log_ppl = loss.clone().detach()
         else:
             logits = self.lm_head(x).float()
             loss, log_ppl = torch.as_tensor(0.0), torch.as_tensor(0.0)
 
         return CausalLMOutputRecurrentLatents(
             loss=loss,
-            log_ppl=log_ppl,
+            log_ppl=log_ppl,  # this value is returned only for compatibility reasons. For this model loss=log-ppl
             logits=logits if output_details["return_logits"] else None,
             past_key_values=past_key_values,
             hidden_states=x if output_details["return_head"] else None,
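Usage sketch, not taken from the repository: assuming the forward pass accepts input_ids and labels as in the branch above, the returned fields can be consumed like this; since log_ppl is a detached copy of the cross-entropy loss for this model, perplexity can be read off either value.

# Hypothetical names: `model` is a loaded RavenForCausalLM, `input_ids` a batch of token ids.
outputs = model(input_ids=input_ids, labels=input_ids)

loss = outputs.loss          # attached to the autograd graph, used for backward()
log_ppl = outputs.log_ppl    # detached copy of the same value (loss == log-ppl here)
perplexity = log_ppl.exp()   # convenient scalar for logging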