Added cached RoPE embedding
modeling_phi4mm.py (+21 -22)
CHANGED
@@ -944,38 +944,37 @@ class Phi4MMLongRoPEScaledRotaryEmbedding(Phi4MMRotaryEmbedding):
         self.short_factor = config.rope_scaling["short_factor"]
         self.long_factor = config.rope_scaling["long_factor"]
         self.original_max_position_embeddings = config.original_max_position_embeddings
+        self.long_inv_freq_expanded = self.seq_freq(self.long_factor)
+        self.short_inv_freq_expanded = self.seq_freq(self.short_factor)
+        scale = self.max_position_embeddings / self.original_max_position_embeddings
+        if scale > 1.0:
+            self.scaling_factor = math.sqrt(1 + math.log(scale) / math.log(self.original_max_position_embeddings))
+        else:
+            self.scaling_factor = 1.0
+
+
+    def seq_freq(self, factor):
+        ext_factors = torch.tensor(factor, dtype=torch.float32, device='cuda')
+        inv_freq_shape = torch.arange(0, self.dim, 2, dtype=torch.int64, device='cuda').float() / self.dim
+        inv_freq = 1.0 / (ext_factors * self.base**inv_freq_shape)
+        inv_freq_expanded = inv_freq[None, :, None].float()
+        return inv_freq_expanded

+    ########## INIT FUNCTION COMPUTES VARIABLES #####################
     @torch.no_grad()
     def forward(self, x, position_ids, seq_len=None):
-        seq_len = torch.max(position_ids) + 1
+        device_type = x.device.type
         if seq_len > self.original_max_position_embeddings:
-            ext_factors = torch.tensor(self.long_factor, dtype=torch.float32, device=x.device)
+            inv_freq_expanded = self.long_inv_freq_expanded.expand(position_ids.shape[0], -1, 1).to(device_type)
         else:
-            ext_factors = torch.tensor(self.short_factor, dtype=torch.float32, device=x.device)
-
-        inv_freq_shape = torch.arange(0, self.dim, 2, dtype=torch.int64, device=x.device).float() / self.dim
-        self.inv_freq = 1.0 / (ext_factors * self.base**inv_freq_shape)
-
-        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
+            inv_freq_expanded = self.short_inv_freq_expanded.expand(position_ids.shape[0], -1, 1).to(device_type)
         position_ids_expanded = position_ids[:, None, :].float()

-        # Force float32 since bfloat16 loses precision on long contexts
-        # See https://github.com/huggingface/transformers/pull/29285
-        device_type = x.device.type
-        device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
         with torch.autocast(device_type=device_type, enabled=False):
             freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
             emb = torch.cat((freqs, freqs), dim=-1)
-
-            scale = self.max_position_embeddings / self.original_max_position_embeddings
-            if scale <= 1.0:
-                scaling_factor = 1.0
-            else:
-                scaling_factor = math.sqrt(1 + math.log(scale) / math.log(self.original_max_position_embeddings))
-
-            cos = emb.cos() * scaling_factor
-            sin = emb.sin() * scaling_factor
-        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
+            cos = emb.cos() * self.scaling_factor
+            sin = emb.sin() * self.scaling_factor


 # Copied from transformers.models.llama.modeling_llama.rotate_half
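For reference, below is a minimal, self-contained sketch of the same caching idea: the long and short inverse-frequency tensors and the LongRoPE attention scaling factor depend only on the config, so they are computed once in __init__ and the per-call forward only selects the cached tensor, forms the angles, and scales cos/sin. This is not the repo's class; the name CachedLongRoPE, the explicit constructor arguments, the device argument (used instead of the commit's hard-coded device='cuda'), and the demo values at the bottom are illustrative assumptions, and the sketch keeps the `return cos..., sin...` line from the pre-change forward.

# Sketch of a cached LongRoPE rotary embedding (assumptions noted above).
import math
import torch


class CachedLongRoPE(torch.nn.Module):
    def __init__(self, dim, base, max_position_embeddings,
                 original_max_position_embeddings, short_factor, long_factor,
                 device=None):
        super().__init__()
        self.dim = dim
        self.base = base
        self.original_max_position_embeddings = original_max_position_embeddings

        # Compute the rescaled inverse frequencies once instead of on every call.
        self.long_inv_freq = self._inv_freq(long_factor, device)
        self.short_inv_freq = self._inv_freq(short_factor, device)

        # Attention scaling for contexts stretched past the original window.
        scale = max_position_embeddings / original_max_position_embeddings
        if scale > 1.0:
            self.scaling_factor = math.sqrt(
                1 + math.log(scale) / math.log(original_max_position_embeddings)
            )
        else:
            self.scaling_factor = 1.0

    def _inv_freq(self, factor, device):
        ext_factors = torch.tensor(factor, dtype=torch.float32, device=device)
        inv_freq_shape = (
            torch.arange(0, self.dim, 2, dtype=torch.int64, device=device).float() / self.dim
        )
        inv_freq = 1.0 / (ext_factors * self.base ** inv_freq_shape)
        return inv_freq[None, :, None]  # shape (1, dim/2, 1)

    @torch.no_grad()
    def forward(self, x, position_ids, seq_len):
        # Pick the cached long- or short-context frequencies.
        cached = (
            self.long_inv_freq
            if seq_len > self.original_max_position_embeddings
            else self.short_inv_freq
        )
        inv_freq_expanded = cached.expand(position_ids.shape[0], -1, 1).to(x.device)
        position_ids_expanded = position_ids[:, None, :].float()

        # Keep the angle computation in float32, as in the original code.
        device_type = x.device.type
        autocast_device = device_type if device_type != "mps" else "cpu"
        with torch.autocast(device_type=autocast_device, enabled=False):
            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
            emb = torch.cat((freqs, freqs), dim=-1)
            cos = emb.cos() * self.scaling_factor
            sin = emb.sin() * self.scaling_factor
        # Kept from the pre-change forward: hand cos/sin back to the attention layers.
        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)


if __name__ == "__main__":
    dim = 64
    rope = CachedLongRoPE(
        dim=dim, base=10000.0, max_position_embeddings=131072,
        original_max_position_embeddings=4096,
        short_factor=[1.0] * (dim // 2), long_factor=[4.0] * (dim // 2),
    )
    x = torch.randn(1, 8, dim)
    position_ids = torch.arange(8)[None, :]
    cos, sin = rope(x, position_ids, seq_len=8)
    print(cos.shape, sin.shape)  # torch.Size([1, 8, 64]) torch.Size([1, 8, 64])

The caching trades a small amount of memory for skipping tensor construction on every decoding step; passing a device (or registering the cached tensors as buffers) keeps the module usable on CPU and MPS, which the hard-coded 'cuda' in the hunk above would not.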