AIDC-AI
/

Ovis2.5-9B

@@ -4,8 +4,6 @@ from typing import Dict, List, Optional, Tuple, Union
 import PIL.Image
 import numpy as np
 import torch
-from flash_attn import flash_attn_varlen_func
-from flash_attn.layers.rotary import apply_rotary_emb
 from torch import Tensor, nn
 from torch.nn import functional as F
 from transformers import (
@@ -19,9 +17,16 @@ from transformers.activations import ACT2FN
 from transformers.generation.utils import GenerateOutput
 from transformers.modeling_outputs import BaseModelOutputWithNoAttention
 from transformers.modeling_utils import PreTrainedModel
 from .configuration_ovis2_5 import Siglip2NavitConfig, Ovis2_5_Config
 IMAGE_PLACEHOLDER = "<image>"
 IMAGE_PLACEHOLDER_ID = -200
 VIDEO_PLACEHOLDER = "<video>"
@@ -30,6 +35,7 @@ VIDEO_PLACEHOLDER_ID = -201
 VISUAL_ATOM_ID = -300
 INDICATOR_IDS = [-301, -302, -303, -304]
 # copied from qwen2.5-vl
 class VisionRotaryEmbedding(nn.Module):
     def __init__(self, dim: int, theta: float = 10000.0) -> None:
@@ -86,7 +92,6 @@ class Siglip2VisionEmbeddings(nn.Module):
     ) -> torch.Tensor:
         """
         Resize positional embeddings to image-specific size and pad to a fixed size.
         Args:
             positional_embeddings (`torch.Tensor`):
                 Position embeddings of shape (height, width, embed_dim)
@@ -94,7 +99,6 @@ class Siglip2VisionEmbeddings(nn.Module):
                 Spatial shapes of shape (batch_size, 2) to resize the positional embeddings to
             max_length (`int`):
                 Maximum length of the positional embeddings to pad resized positional embeddings to
         Returns:
             `torch.Tensor`: Embeddings of shape (batch_size, max_length, embed_dim)
         """
@@ -193,6 +197,28 @@ def apply_rotary_pos_emb_flashatt(
     return q_embed, k_embed
 class Siglip2Attention(nn.Module):
     """Multi-headed attention from 'Attention Is All You Need' paper"""
@@ -238,14 +264,41 @@ class Siglip2Attention(nn.Module):
         if self.use_rope:
             cos, sin = position_embeddings
-            queries, keys = apply_rotary_pos_emb_flashatt(queries.unsqueeze(0), keys.unsqueeze(0), cos, sin)
             queries = queries.squeeze(0)
             keys = keys.squeeze(0)
         max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item()
-        attn_output = flash_attn_varlen_func(queries, keys, values, cu_seqlens, cu_seqlens, max_seqlen, max_seqlen).reshape(
-                                            seq_length, -1
-                                        )
         attn_output = self.out_proj(attn_output)
         return attn_output
@@ -310,7 +363,6 @@ class Siglip2Encoder(nn.Module):
     """
     Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
     [`Siglip2EncoderLayer`].
     Args:
         config: Siglip2NavitConfig
     """
@@ -415,10 +467,8 @@ class Siglip2Encoder(nn.Module):
                 than the model's internal embedding lookup matrix.
             attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                 Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
                 - 1 for tokens that are **not masked**,
                 - 0 for tokens that are **masked**.
                 [What are attention masks?](../glossary#attention-mask)
             output_attentions (`bool`, *optional*):
                 Whether or not to return the attentions tensors of all attention layers. See `attentions` under
@@ -946,4 +996,4 @@ class Ovis2_5(OvisPreTrainedModel):
 AutoConfig.register('siglip2_navit', Siglip2NavitConfig)
 AutoModel.register(Siglip2NavitConfig, Siglip2NavitModel)
 AutoConfig.register("ovis2_5", Ovis2_5_Config)
-AutoModelForCausalLM.register(Ovis2_5_Config, Ovis2_5)

 import PIL.Image
 import numpy as np
 import torch
 from torch import Tensor, nn
 from torch.nn import functional as F
 from transformers import (
 from transformers.generation.utils import GenerateOutput
 from transformers.modeling_outputs import BaseModelOutputWithNoAttention
 from transformers.modeling_utils import PreTrainedModel
+from transformers.utils import is_flash_attn_2_available
 from .configuration_ovis2_5 import Siglip2NavitConfig, Ovis2_5_Config
+if is_flash_attn_2_available():
+    from flash_attn import flash_attn_varlen_func
+    from flash_attn.layers.rotary import apply_rotary_emb
 IMAGE_PLACEHOLDER = "<image>"
 IMAGE_PLACEHOLDER_ID = -200
 VIDEO_PLACEHOLDER = "<video>"
 VISUAL_ATOM_ID = -300
 INDICATOR_IDS = [-301, -302, -303, -304]
 # copied from qwen2.5-vl
 class VisionRotaryEmbedding(nn.Module):
     def __init__(self, dim: int, theta: float = 10000.0) -> None:
     ) -> torch.Tensor:
         """
         Resize positional embeddings to image-specific size and pad to a fixed size.
         Args:
             positional_embeddings (`torch.Tensor`):
                 Position embeddings of shape (height, width, embed_dim)
                 Spatial shapes of shape (batch_size, 2) to resize the positional embeddings to
             max_length (`int`):
                 Maximum length of the positional embeddings to pad resized positional embeddings to
         Returns:
             `torch.Tensor`: Embeddings of shape (batch_size, max_length, embed_dim)
         """
     return q_embed, k_embed
+# Copied from transformers.models.llama.modeling_llama.rotate_half
+def rotate_half(x):
+    """Swap the two halves of the last dimension, negating the second half.
+
+    The last axis of `x` is split at `x.shape[-1] // 2`. The result is the
+    negated trailing part followed by the leading part, concatenated along
+    the last axis, so the output has the same shape as `x`.
+    """
+    half = x.shape[-1] // 2
+    front, back = x[..., :half], x[..., half:]
+    return torch.cat((-back, front), dim=-1)
+def apply_rotary_pos_emb_vision(
+    q: torch.Tensor, k: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor
+) -> tuple[torch.Tensor, torch.Tensor]:
+    """Apply rotary position embeddings to `q` and `k` using plain torch ops.
+
+    All arithmetic is done in float32; each result is cast back to the dtype
+    its input had on entry. `cos` and `sin` get a new axis inserted at
+    position -2 before being multiplied with `q` / `k`.
+    NOTE(review): that extra axis presumably broadcasts over attention heads,
+    i.e. q/k are (..., seq, heads, head_dim) and cos/sin are (seq, head_dim)
+    -- confirm against the caller.
+
+    Returns:
+        A `(q_embed, k_embed)` tuple of the rotated tensors.
+    """
+    # Remember the incoming dtypes so the float32 math can be undone on exit.
+    q_dtype, k_dtype = q.dtype, k.dtype
+    cos_f = cos.unsqueeze(-2).float()
+    sin_f = sin.unsqueeze(-2).float()
+    q_f = q.float()
+    k_f = k.float()
+    q_rot = q_f * cos_f + rotate_half(q_f) * sin_f
+    k_rot = k_f * cos_f + rotate_half(k_f) * sin_f
+    return q_rot.to(q_dtype), k_rot.to(k_dtype)
 class Siglip2Attention(nn.Module):
     """Multi-headed attention from 'Attention Is All You Need' paper"""
         if self.use_rope:
             cos, sin = position_embeddings
+            if is_flash_attn_2_available():
+                queries, keys = apply_rotary_pos_emb_flashatt(queries.unsqueeze(0), keys.unsqueeze(0), cos, sin)
+            else:
+                queries, keys = apply_rotary_pos_emb_vision(queries.unsqueeze(0), keys.unsqueeze(0), cos, sin)
             queries = queries.squeeze(0)
             keys = keys.squeeze(0)
         max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item()
+        if is_flash_attn_2_available():
+            attn_output = flash_attn_varlen_func(queries, keys, values, cu_seqlens, cu_seqlens, max_seqlen, max_seqlen).reshape(
+                                                seq_length, -1
+                                            )
+        else:
+            batch_size = cu_seqlens.shape[0] - 1
+            outputs = []
+            cu = cu_seqlens.tolist()
+            for i in range(batch_size):
+                start_idx = cu[i]
+                end_idx = cu[i + 1]
+                # Each sequence is processed independently.
+                q_i = queries[start_idx:end_idx].unsqueeze(0)
+                k_i = keys[start_idx:end_idx].unsqueeze(0)
+                v_i = values[start_idx:end_idx].unsqueeze(0)
+                # (1, seq_len, num_heads, head_dim) ->
+                # (1, num_heads, seq_len, head_dim)
+                q_i, k_i, v_i = [x.transpose(1, 2) for x in (q_i, k_i, v_i)]
+                output_i = F.scaled_dot_product_attention(q_i,
+                                                        k_i,
+                                                        v_i,
+                                                        dropout_p=0.0)
+                # (1, num_heads, seq_len, head_dim) -> (seq_len, embed_dim)
+                output_i = output_i.transpose(1, 2).reshape(-1, self.embed_dim)
+                outputs.append(output_i)
+            attn_output = torch.cat(outputs, dim=0)
         attn_output = self.out_proj(attn_output)
         return attn_output
     """
     Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
     [`Siglip2EncoderLayer`].
     Args:
         config: Siglip2NavitConfig
     """
                 than the model's internal embedding lookup matrix.
             attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                 Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
                 - 1 for tokens that are **not masked**,
                 - 0 for tokens that are **masked**.
                 [What are attention masks?](../glossary#attention-mask)
             output_attentions (`bool`, *optional*):
                 Whether or not to return the attentions tensors of all attention layers. See `attentions` under
 AutoConfig.register('siglip2_navit', Siglip2NavitConfig)
 AutoModel.register(Siglip2NavitConfig, Siglip2NavitModel)
 AutoConfig.register("ovis2_5", Ovis2_5_Config)
+AutoModelForCausalLM.register(Ovis2_5_Config, Ovis2_5)