Alibaba-NLP
/

gme-Qwen2-VL-2B-Instruct

@@ -1,8 +1,10 @@
 {
   "_name_or_path": "gme-Qwen2-VL-2B-Instruct",
-  "architectures": [
-    "Qwen2VLForConditionalGeneration"
-  ],
   "attention_dropout": 0.0,
   "bos_token_id": 151643,
   "eos_token_id": 151645,
@@ -13,17 +15,13 @@
   "intermediate_size": 8960,
   "max_position_embeddings": 32768,
   "max_window_layers": 28,
-  "model_type": "qwen2_vl",
   "num_attention_heads": 12,
   "num_hidden_layers": 28,
   "num_key_value_heads": 2,
-  "rms_norm_eps": 1e-06,
   "rope_scaling": {
-    "mrope_section": [
-      16,
-      24,
-      24
-    ],
     "type": "mrope"
   },
   "rope_theta": 1000000.0,

 {
   "_name_or_path": "gme-Qwen2-VL-2B-Instruct",
+  "architectures": ["GmeQwen2VLForVision2Seq"],
+  "auto_map": {
+    "AutoModel": "gme_inference.GmeQwen2VLForVision2Seq",
+    "AutoConfig": "gme_inference.GmeQwen2VLConfig"
+  },
   "attention_dropout": 0.0,
   "bos_token_id": 151643,
   "eos_token_id": 151645,
   "intermediate_size": 8960,
   "max_position_embeddings": 32768,
   "max_window_layers": 28,
+  "model_type": "gme_qwen2_vl",
   "num_attention_heads": 12,
   "num_hidden_layers": 28,
   "num_key_value_heads": 2,
+  "rms_norm_eps": 1e-6,
   "rope_scaling": {
+    "mrope_section": [16, 24, 24],
     "type": "mrope"
   },
   "rope_theta": 1000000.0,

gme_inference.py CHANGED Viewed

@@ -1,45 +1,79 @@
 from __future__ import annotations
 import logging
 import math
 import os
-from typing import Dict, List, Optional
 import torch
 from PIL import Image
 from torch.utils.data import DataLoader
 from tqdm.autonotebook import tqdm
-from transformers import AutoModelForVision2Seq, AutoProcessor
-class GmeQwen2VL:
     def __init__(
         self,
-        model_name: str = "Alibaba-NLP/gme-Qwen2-VL-2B-Instruct",
-        model_path: Optional[str] = None,
         device: str = "cuda" if torch.cuda.is_available() else "cpu",
-        min_image_tokens=256,
-        max_image_tokens=1280,
-        max_length=1800,
-        **kwargs,
     ) -> None:
-        model_name = model_path or model_name
-        self.base = AutoModelForVision2Seq.from_pretrained(
-            model_name, torch_dtype=torch.float16, **kwargs
-        )
-        self.base.eval()
-        self.normalize = True
-        self.device = device
-        min_pixels = min_image_tokens * 28 * 28
-        max_pixels = max_image_tokens * 28 * 28
         self.max_length = max_length
         self.processor = AutoProcessor.from_pretrained(
             model_name, min_pixels=min_pixels, max_pixels=max_pixels, **kwargs
         )
-        self.processor.tokenizer.padding_side = 'right'
-        self.defualt_instruction = 'You are a helpful assistant.'
-        self.sep = ' '
     def forward(
         self,
         input_ids: Optional[torch.LongTensor] = None,
@@ -48,11 +82,9 @@ class GmeQwen2VL:
         past_key_values: Optional[List[torch.FloatTensor]] = None,
         inputs_embeds: Optional[torch.FloatTensor] = None,
         pixel_values: Optional[torch.Tensor] = None,
-        # pixel_values_videos: Optional[torch.FloatTensor] = None,
         image_grid_thw: Optional[torch.LongTensor] = None,
-        # video_grid_thw: Optional[torch.LongTensor] = None,
         pooling_mask: Optional[torch.LongTensor] = None,
-        **kwargs
     ) -> torch.Tensor:
         if inputs_embeds is None:
             inputs_embeds = self.base.model.embed_tokens(input_ids)
@@ -61,11 +93,6 @@ class GmeQwen2VL:
                 image_embeds = self.base.visual(pixel_values, grid_thw=image_grid_thw).to(inputs_embeds.device)
                 image_mask = input_ids == self.base.config.image_token_id
                 inputs_embeds[image_mask] = image_embeds
-            # if pixel_values_videos is not None:
-            #     pixel_values_videos = pixel_values_videos.type(self.base.visual.get_dtype())
-            #     video_embeds = self.base.visual(pixel_values_videos, grid_thw=video_grid_thw).to(inputs_embeds.device)
-            #     video_mask = input_ids == self.base.config.video_token_id
-            #     inputs_embeds[video_mask] = video_embeds
             if attention_mask is not None:
                 attention_mask = attention_mask.to(inputs_embeds.device)
@@ -78,36 +105,48 @@ class GmeQwen2VL:
         )
         pooling_mask = attention_mask if pooling_mask is None else pooling_mask
-        left_padding = (pooling_mask[:, -1].sum() == pooling_mask.shape[0])  # TODO
         if left_padding:
             embeddings = outputs.last_hidden_state[:, -1]
         else:
             sequence_lengths = pooling_mask.sum(dim=1) - 1
             batch_size = outputs.last_hidden_state.shape[0]
-            embeddings = outputs.last_hidden_state[torch.arange(
-                batch_size, device=outputs.last_hidden_state.device
-            ), sequence_lengths]
         if self.normalize:
             embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)
         return embeddings.contiguous()
-    def embed(self, texts: list[str], images: list[Image.Image], is_query=True, instruction=None, **kwargs):
         self.base.to(self.device)
-        # Inputs must be batched
-        input_texts, input_images = list(), list()
         for t, i in zip(texts, images):
             if not is_query or instruction is None:
                 instruction = self.defualt_instruction
-            input_str = ''
             if i is None:
                 input_images = None  # All examples in the same batch are consistent
             else:
-                input_str += '<|vision_start|><|image_pad|><|vision_end|>'
                 i = fetch_image(i)
                 input_images.append(i)
             if t is not None:
                 input_str += t
-            msg = f'<|im_start|>system\n{instruction}<|im_end|>\n<|im_start|>user\n{input_str}<|im_end|>\n<|im_start|>assistant\n<|endoftext|>'
             input_texts.append(msg)
         inputs = self.processor(
@@ -116,22 +155,22 @@ class GmeQwen2VL:
             padding=True,
             truncation=True,
             max_length=self.max_length,
-            return_tensors='pt'
         )
-        inputs = {k: v.to(self.device) for k, v in inputs.items()}  # TODO
         with torch.no_grad():
             embeddings = self.forward(**inputs)
         return embeddings
-    def encode(self, sentences: list[str], *, prompt_name=None, **kwargs):
-        return self.get_fused_embeddings(texts=sentences, prompt_name=prompt_name, **kwargs)
-    def encode_queries(self, queries: List[str], **kwargs):
-        embeddings = self.encode(queries, **kwargs)
-        return embeddings
-    def encode_corpus(self, corpus: List[Dict[str, str]], **kwargs):
-        if type(corpus) is dict:
             sentences = [
                 (corpus["title"][i] + self.sep + corpus["text"][i]).strip()
                 if "title" in corpus
@@ -143,68 +182,55 @@ class GmeQwen2VL:
                 (doc["title"] + self.sep + doc["text"]).strip() if "title" in doc else doc["text"].strip()
                 for doc in corpus
             ]
-        embeddings = self.encode(sentences, is_query=False, **kwargs)
-        return embeddings
-    def get_image_embeddings(self, images: list[Image.Image] | DataLoader, **kwargs):
         return self.get_fused_embeddings(images=images, **kwargs)
-    def get_text_embeddings(self, texts: list[str], **kwargs):
         return self.get_fused_embeddings(texts=texts, **kwargs)
-    def get_fused_embeddings(self, texts: list[str] = None, images: list[Image.Image] | DataLoader = None, **kwargs):
         if isinstance(images, DataLoader):
             image_loader = images
             batch_size = image_loader.batch_size
             image_loader.dataset.transform = None
         else:
-            batch_size = kwargs.pop('batch_size', 32)
             if images is None:
-                image_loader = None
             else:
-                image_loader = DataLoader(
-                    images,
-                    batch_size=batch_size,
-                    shuffle=False,
-                    collate_fn=custom_collate_fn,
-                    num_workers=min(math.floor(os.cpu_count() / 2), 8),
-                )
-        if texts is None:
-            assert image_loader is not None
-            n_batch = len(image_loader)
-        else:
-            n_batch = len(texts) // batch_size + int(len(texts) % batch_size > 0)
-            image_loader = image_loader or [None] * n_batch
-        all_embeddings = list()
         none_batch = [None] * batch_size
-        show_progress_bar = kwargs.pop('show_progress_bar', True)
-        pbar = tqdm(total=n_batch, disable=not show_progress_bar, mininterval=1, miniters=10, desc='encode')
         for n, img_batch in zip(range(0, n_batch * batch_size, batch_size), image_loader):
-            text_batch = none_batch if texts is None else texts[n: n+batch_size]
             img_batch = none_batch if img_batch is None else img_batch
             embeddings = self.embed(texts=text_batch, images=img_batch, **kwargs)
             pbar.update(1)
             all_embeddings.append(embeddings.cpu())
         pbar.close()
-        all_embeddings = torch.cat(all_embeddings, dim=0)
-        return all_embeddings
-def custom_collate_fn(batch):
-    return batch
-### Copied from qwen_vl_utils.vision_process.py
-import base64
-from io import BytesIO
-import requests
-IMAGE_FACTOR = 28
-MIN_PIXELS = 4 * 28 * 28
-MAX_PIXELS = 16384 * 28 * 28
-MAX_RATIO = 200
 def round_by_factor(number: int, factor: int) -> int:
@@ -226,13 +252,10 @@ def smart_resize(
     height: int, width: int, factor: int = IMAGE_FACTOR, min_pixels: int = MIN_PIXELS, max_pixels: int = MAX_PIXELS
 ) -> tuple[int, int]:
     """
-    Rescales the image so that the following conditions are met:
-    1. Both dimensions (height and width) are divisible by 'factor'.
-    2. The total number of pixels is within the range ['min_pixels', 'max_pixels'].
-    3. The aspect ratio of the image is maintained as closely as possible.
     """
     h_bar = max(factor, round_by_factor(height, factor))
     w_bar = max(factor, round_by_factor(width, factor))
@@ -256,35 +279,27 @@ def smart_resize(
     return h_bar, w_bar
-def fetch_image(image: str | Image.Image, size_factor: int = IMAGE_FACTOR) -> Image.Image:
-    image_obj = None
     if isinstance(image, Image.Image):
         image_obj = image
-    elif image.startswith("http://") or image.startswith("https://"):
         image_obj = Image.open(requests.get(image, stream=True).raw)
-    elif image.startswith("file://"):
         image_obj = Image.open(image[7:])
-    elif image.startswith("data:image"):
         if "base64," in image:
             _, base64_data = image.split("base64,", 1)
             data = base64.b64decode(base64_data)
             image_obj = Image.open(BytesIO(data))
-    else:
         image_obj = Image.open(image)
     if image_obj is None:
-        raise ValueError(f"Unrecognized image input, support local path, http url, base64 and PIL.Image, got {image}")
     image = image_obj.convert("RGB")
-    ## resize
-    # if "resized_height" in ele and "resized_width" in ele:
-    #     resized_height, resized_width = smart_resize(
-    #         ele["resized_height"],
-    #         ele["resized_width"],
-    #         factor=size_factor,
-    #     )
-    # else:
     width, height = image.size
-    # min_pixels = ele.get("min_pixels", MIN_PIXELS)
-    # max_pixels = ele.get("max_pixels", MAX_PIXELS)
     resized_height, resized_width = smart_resize(
         height,
         width,
@@ -293,37 +308,44 @@ def fetch_image(image: str | Image.Image, size_factor: int = IMAGE_FACTOR) -> Im
         max_pixels=MAX_PIXELS,
     )
     image = image.resize((resized_width, resized_height))
     return image
-###
-if __name__ == '__main__':
     texts = [
         "What kind of car is this?",
-        "The Tesla Cybertruck is a battery electric pickup truck built by Tesla, Inc. since 2023."
     ]
     images = [
-        'https://en.wikipedia.org/wiki/File:Tesla_Cybertruck_damaged_window.jpg',
-        'https://en.wikipedia.org/wiki/File:2024_Tesla_Cybertruck_Foundation_Series,_front_left_(Greenwich).jpg',
     ]
-    gme = GmeQwen2VL("Alibaba-NLP/gme-Qwen2-VL-2B-Instruct")
-    # Single-modal embedding
-    e_text = gme.get_text_embeddings(texts=texts)
-    e_image = gme.get_image_embeddings(images=images)
-    print((e_text * e_image).sum(-1))
-    ## tensor([0.2281, 0.6001], dtype=torch.float16)
-    # How to set embedding instruction
-    e_query = gme.get_text_embeddings(texts=texts, instruction='Find an image that matches the given text.')
-    # If is_query=False, we always use the default instruction.
-    e_corpus = gme.get_image_embeddings(images=images, is_query=False)
-    print((e_query * e_corpus).sum(-1))
-    ## tensor([0.2433, 0.7051], dtype=torch.float16)
-    # Fused-modal embedding
-    e_fused = gme.get_fused_embeddings(texts=texts, images=images)
-    print((e_fused[0] * e_fused[1]).sum())
-    ## tensor(0.6108, dtype=torch.float16)

 from __future__ import annotations
+import base64
 import logging
 import math
 import os
+from io import BytesIO
+from typing import Any, Dict, List, Optional, Union
+import requests
 import torch
 from PIL import Image
 from torch.utils.data import DataLoader
 from tqdm.autonotebook import tqdm
+from transformers import (
+    AutoConfig,
+    AutoModel,
+    AutoModelForVision2Seq,
+    AutoProcessor,
+    PreTrainedModel,
+    Qwen2VLConfig,
+    Qwen2VLModel,
+)
+import os
+# Define a config class for our model.
+class GmeQwen2VLConfig(Qwen2VLConfig):
+    """Qwen2-VL configuration extended with the GME embedding settings.
+
+    On top of whatever ``Qwen2VLConfig`` accepts through ``**kwargs``, this
+    stores four extra attributes that the model wrapper reads back:
+
+    * ``min_image_tokens`` / ``max_image_tokens`` -- image token budget; the
+      model wrapper multiplies these by ``28 * 28`` to obtain the processor's
+      ``min_pixels`` / ``max_pixels``.
+    * ``max_length`` -- truncation length handed to the processor.
+    * ``device`` -- device string; the default is computed once, when this
+      class body is executed, from ``torch.cuda.is_available()``.
+    """
+    # Identifier under which this config is registered with AutoConfig.
+    model_type: str = "gme_qwen2_vl"
     def __init__(
         self,
+        min_image_tokens: int = 256,
+        max_image_tokens: int = 1280,
+        max_length: int = 1800,
         device: str = "cuda" if torch.cuda.is_available() else "cpu",
+        **kwargs: Any,
     ) -> None:
+        # Everything not consumed above is forwarded to Qwen2VLConfig.
+        super().__init__(**kwargs)
+        self.min_image_tokens = min_image_tokens
+        self.max_image_tokens = max_image_tokens
         self.max_length = max_length
+        self.device = device
+AutoConfig.register("gme_qwen2_vl", GmeQwen2VLConfig)
+# Define the model class so that it can be loaded by AutoModel.from_pretrained.
+class GmeQwen2VLForVision2Seq(PreTrainedModel):
+    config_class = GmeQwen2VLConfig
+    base_model_prefix: str = "base"
+    def __init__(self, config: GmeQwen2VLConfig, **kwargs: Any) -> None:
+        super().__init__(config)
+        model_name: str = getattr(config, "_name_or_path", "Alibaba-NLP/gme-Qwen2-VL-2B-Instruct")
+        # Load the underlying vision-to-sequence model.
+        self.base = Qwen2VLModel.from_pretrained(
+            model_name, trust_remote_code=True, **kwargs
+        )
+        self.normalize: bool = True
+        self.device: str = config.device
+        min_pixels: int = config.min_image_tokens * 28 * 28
+        max_pixels: int = config.max_image_tokens * 28 * 28
+        self.max_length: int = config.max_length
         self.processor = AutoProcessor.from_pretrained(
             model_name, min_pixels=min_pixels, max_pixels=max_pixels, **kwargs
         )
+        self.processor.tokenizer.padding_side = "right"
+        self.defualt_instruction: str = "You are a helpful assistant."
+        self.sep: str = " "
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path: str, **kwargs: Any) -> GmeQwen2VLForVision2Seq:
+        config = kwargs.pop("config", GmeQwen2VLConfig.from_pretrained(pretrained_model_name_or_path, **kwargs))
+        return cls(config, **kwargs)
     def forward(
         self,
         input_ids: Optional[torch.LongTensor] = None,
         past_key_values: Optional[List[torch.FloatTensor]] = None,
         inputs_embeds: Optional[torch.FloatTensor] = None,
         pixel_values: Optional[torch.Tensor] = None,
         image_grid_thw: Optional[torch.LongTensor] = None,
         pooling_mask: Optional[torch.LongTensor] = None,
+        **kwargs: Any,
     ) -> torch.Tensor:
         if inputs_embeds is None:
             inputs_embeds = self.base.model.embed_tokens(input_ids)
                 image_embeds = self.base.visual(pixel_values, grid_thw=image_grid_thw).to(inputs_embeds.device)
                 image_mask = input_ids == self.base.config.image_token_id
                 inputs_embeds[image_mask] = image_embeds
             if attention_mask is not None:
                 attention_mask = attention_mask.to(inputs_embeds.device)
         )
         pooling_mask = attention_mask if pooling_mask is None else pooling_mask
+        left_padding: bool = (pooling_mask[:, -1].sum() == pooling_mask.shape[0])
         if left_padding:
             embeddings = outputs.last_hidden_state[:, -1]
         else:
             sequence_lengths = pooling_mask.sum(dim=1) - 1
             batch_size = outputs.last_hidden_state.shape[0]
+            embeddings = outputs.last_hidden_state[
+                torch.arange(batch_size, device=outputs.last_hidden_state.device),
+                sequence_lengths,
+            ]
         if self.normalize:
             embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)
         return embeddings.contiguous()
+    def embed(
+        self,
+        texts: List[str],
+        images: List[Image.Image],
+        is_query: bool = True,
+        instruction: Optional[str] = None,
+        **kwargs: Any,
+    ) -> torch.Tensor:
         self.base.to(self.device)
+        input_texts: List[str] = []
+        input_images: List[Image.Image] = []
         for t, i in zip(texts, images):
             if not is_query or instruction is None:
                 instruction = self.defualt_instruction
+            input_str: str = ""
             if i is None:
                 input_images = None  # All examples in the same batch are consistent
             else:
+                input_str += "<|vision_start|><|image_pad|><|vision_end|>"
                 i = fetch_image(i)
                 input_images.append(i)
             if t is not None:
                 input_str += t
+            msg: str = (
+                f"<|im_start|>system\n{instruction}<|im_end|>\n"
+                f"<|im_start|>user\n{input_str}<|im_end|>\n"
+                f"<|im_start|>assistant\n<|endoftext|>"
+            )
             input_texts.append(msg)
         inputs = self.processor(
             padding=True,
             truncation=True,
             max_length=self.max_length,
+            return_tensors="pt",
         )
+        inputs = {k: v.to(self.device) for k, v in inputs.items()}
         with torch.no_grad():
             embeddings = self.forward(**inputs)
         return embeddings
+    def encode(self, sentences: List[str], **kwargs: Any) -> torch.Tensor:
+        """Embed text-only inputs with a single call to ``embed``.
+
+        Args:
+            sentences: Texts to embed.
+            **kwargs: Forwarded to ``embed`` (e.g. ``is_query``, ``instruction``).
+
+        Returns:
+            Whatever ``embed`` returns for the whole list; no batching is done here.
+        """
+        # When no images are provided, we pass a list of Nones.
+        return self.embed(texts=sentences, images=[None] * len(sentences), **kwargs)
+    def encode_queries(self, queries: List[str], **kwargs: Any) -> torch.Tensor:
+        """Embed query strings; a thin alias that forwards everything to ``encode``."""
+        return self.encode(queries, **kwargs)
+    def encode_corpus(self, corpus: Union[Dict[str, List[str]], List[Dict[str, str]]], **kwargs: Any) -> torch.Tensor:
+        if isinstance(corpus, dict):
             sentences = [
                 (corpus["title"][i] + self.sep + corpus["text"][i]).strip()
                 if "title" in corpus
                 (doc["title"] + self.sep + doc["text"]).strip() if "title" in doc else doc["text"].strip()
                 for doc in corpus
             ]
+        return self.encode(sentences, is_query=False, **kwargs)
+    def get_image_embeddings(self, images: Union[List[Image.Image], DataLoader], **kwargs: Any) -> torch.Tensor:
+        """Embed images only by delegating to ``get_fused_embeddings`` without texts.
+
+        ``**kwargs`` is passed through unchanged.
+        """
         return self.get_fused_embeddings(images=images, **kwargs)
+    def get_text_embeddings(self, texts: List[str], **kwargs: Any) -> torch.Tensor:
+        """Embed texts only by delegating to ``get_fused_embeddings`` without images.
+
+        ``**kwargs`` is passed through unchanged.
+        """
         return self.get_fused_embeddings(texts=texts, **kwargs)
+    def get_fused_embeddings(
+        self,
+        texts: Optional[List[str]] = None,
+        images: Optional[Union[List[Image.Image], DataLoader]] = None,
+        **kwargs: Any,
+    ) -> torch.Tensor:
         if isinstance(images, DataLoader):
             image_loader = images
             batch_size = image_loader.batch_size
             image_loader.dataset.transform = None
         else:
+            batch_size = kwargs.pop("batch_size", 32)
             if images is None:
+                # If texts are provided without images, create dummy image batches.
+                image_loader = [None] * ((len(texts) + batch_size - 1) // batch_size)
             else:
+                image_loader = images
+        n_batch: int = (len(texts) // batch_size + int(len(texts) % batch_size > 0)) if texts is not None else len(image_loader)
+        all_embeddings: List[torch.Tensor] = []
         none_batch = [None] * batch_size
+        show_progress_bar: bool = kwargs.pop("show_progress_bar", True)
+        pbar = tqdm(total=n_batch, disable=not show_progress_bar, mininterval=1, miniters=10, desc="encode")
         for n, img_batch in zip(range(0, n_batch * batch_size, batch_size), image_loader):
+            text_batch: List[Optional[str]] = none_batch if texts is None else texts[n: n + batch_size]
             img_batch = none_batch if img_batch is None else img_batch
             embeddings = self.embed(texts=text_batch, images=img_batch, **kwargs)
             pbar.update(1)
             all_embeddings.append(embeddings.cpu())
         pbar.close()
+        return torch.cat(all_embeddings, dim=0)
+from transformers import AutoModelForVision2Seq
+AutoModelForVision2Seq.register(GmeQwen2VLConfig, GmeQwen2VLForVision2Seq)
+# Utility functions (copied from qwen_vl_utils.vision_process.py)
+IMAGE_FACTOR: int = 28
+MIN_PIXELS: int = 4 * 28 * 28
+MAX_PIXELS: int = 16384 * 28 * 28
+MAX_RATIO: int = 200
 def round_by_factor(number: int, factor: int) -> int:
     height: int, width: int, factor: int = IMAGE_FACTOR, min_pixels: int = MIN_PIXELS, max_pixels: int = MAX_PIXELS
 ) -> tuple[int, int]:
     """
+    Rescales the image so that:
+      1. Both dimensions are divisible by 'factor'.
+      2. Total pixels fall between ['min_pixels', 'max_pixels'].
+      3. Aspect ratio is maintained as closely as possible.
     """
     h_bar = max(factor, round_by_factor(height, factor))
     w_bar = max(factor, round_by_factor(width, factor))
     return h_bar, w_bar
+def fetch_image(image: Union[str, Image.Image], size_factor: int = IMAGE_FACTOR) -> Image.Image:
+    image_obj: Optional[Image.Image] = None
     if isinstance(image, Image.Image):
         image_obj = image
+    elif isinstance(image, str) and (image.startswith("http://") or image.startswith("https://")):
         image_obj = Image.open(requests.get(image, stream=True).raw)
+    elif isinstance(image, str) and image.startswith("file://"):
         image_obj = Image.open(image[7:])
+    elif isinstance(image, str) and image.startswith("data:image"):
         if "base64," in image:
             _, base64_data = image.split("base64,", 1)
             data = base64.b64decode(base64_data)
             image_obj = Image.open(BytesIO(data))
+    elif isinstance(image, str):
         image_obj = Image.open(image)
     if image_obj is None:
+        raise ValueError(
+            f"Unrecognized image input, support local path, http url, base64 and PIL.Image, got {image}"
+        )
     image = image_obj.convert("RGB")
     width, height = image.size
     resized_height, resized_width = smart_resize(
         height,
         width,
         max_pixels=MAX_PIXELS,
     )
     image = image.resize((resized_width, resized_height))
     return image
+# # For backward compatibility, you can add a from_pretrained classmethod.
+# @classmethod
+# def from_pretrained(cls, pretrained_model_name_or_path: str, **kwargs: Any) -> GmeQwen2VLForVision2Seq:
+#     config = GmeQwen2VLConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
+#     return cls(config, **kwargs)
+# # Monkey-patch the from_pretrained method to our class so that
+# # one can load the model with AutoModel.from_pretrained.
+# GmeQwen2VLForVision2Seq.from_pretrained = from_pretrained.__get__(GmeQwen2VLForVision2Seq)
+if __name__ == "__main__":
     texts = [
         "What kind of car is this?",
+        "The Tesla Cybertruck is a battery electric pickup truck built by Tesla, Inc. since 2023.",
     ]
     images = [
+        "https://en.wikipedia.org/wiki/File:Tesla_Cybertruck_damaged_window.jpg",
+        "https://en.wikipedia.org/wiki/File:2024_Tesla_Cybertruck_Foundation_Series,_front_left_(Greenwich).jpg",
     ]
+    # You can now load your model with AutoModel as long as your repository's config JSON has the "architectures" field set.
+    model = AutoModel.from_pretrained("Alibaba-NLP/gme-Qwen2-VL-2B-Instruct")
+    # Alternatively, load it directly via our class:
+    # model = GmeQwen2VLForVision2Seq.from_pretrained("Alibaba-NLP/gme-Qwen2-VL-2B-Instruct")
+    # Single-modal embedding examples:
+    e_text = model.get_text_embeddings(texts=texts)
+    e_image = model.get_image_embeddings(images=images)
+    print("Text-Image similarity:", (e_text * e_image).sum(-1))
+    # Example with different instruction:
+    e_query = model.get_text_embeddings(texts=texts, instruction="Find an image that matches the given text.")
+    e_corpus = model.get_image_embeddings(images=images, is_query=False)
+    print("Query-Corpus similarity:", (e_query * e_corpus).sum(-1))
+    # Fused-modal embedding:
+    e_fused = model.get_fused_embeddings(texts=texts, images=images)
+    print("Fused-modal similarity:", (e_fused[0] * e_fused[1]).sum())