Gengzigang committed · Commit a7c1a46 · 1 Parent(s): 1b9deef

save

Files changed:
- .gitattributes +1 -0
- README.md +0 -1
- config.json +17 -95
- configuration_clip.py +80 -46
- pytorch_model.bin → model.safetensors +2 -2
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+model.safetensors filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -37,7 +37,6 @@ import torch
 
 image_path = "CLIP.png"
 model_name_or_path = "LLM2CLIP-Openai-L-14-336" # or /path/to/local/LLM2CLIP-Openai-L-14-336
-image_size =336
 
 processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-large-patch14-336")
 model = AutoModel.from_pretrained(
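The README snippet in this hunk is truncated at the `AutoModel.from_pretrained(` call. Below is a minimal sketch of how that flow presumably continues, under stated assumptions: `trust_remote_code=True` is needed because config.json maps `AutoModel` to this repo's custom `modeling_clip` code, and `get_image_features` is assumed to be the image-encoding entry point. With the hard-coded `image_size = 336` line removed by this commit, resizing is left to the 336-pixel image processor.

```python
# Sketch only; not taken verbatim from the (truncated) README diff above.
from PIL import Image
import torch
from transformers import AutoModel, CLIPImageProcessor

processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-large-patch14-336")
model = AutoModel.from_pretrained(
    "LLM2CLIP-Openai-L-14-336",  # or /path/to/local/LLM2CLIP-Openai-L-14-336
    trust_remote_code=True,      # follow the auto_map entries in this repo's config.json
).eval()

image = Image.open("CLIP.png")
pixel_values = processor(images=image, return_tensors="pt").pixel_values
with torch.no_grad():
    # Assumption: the custom LLM2CLIPModel exposes get_image_features like CLIPModel does.
    image_features = model.get_image_features(pixel_values)
print(image_features.shape)
```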
config.json CHANGED
@@ -1,97 +1,16 @@
 {
-  "
-  "_name_or_path": "LLM2CLIP-Openai-L-14",
+  "_name_or_path": "LLM2CLIP-Openai-L-14-336",
   "architectures": [
-    "
+    "LLM2CLIPModel"
   ],
   "auto_map": {
     "AutoConfig": "configuration_clip.CLIPConfig",
-    "AutoModel": "modeling_clip.
+    "AutoModel": "modeling_clip.LLM2CLIPModel"
   },
   "initializer_factor": 1.0,
   "logit_scale_init_value": 2.6592,
   "model_type": "clip",
   "projection_dim": 1280,
-  "text_config": {
-    "_name_or_path": "",
-    "add_cross_attention": false,
-    "architectures": null,
-    "attention_dropout": 0.0,
-    "bad_words_ids": null,
-    "begin_suppress_tokens": null,
-    "bos_token_id": 0,
-    "chunk_size_feed_forward": 0,
-    "cross_attention_hidden_size": null,
-    "decoder_start_token_id": null,
-    "diversity_penalty": 0.0,
-    "do_sample": false,
-    "early_stopping": false,
-    "encoder_no_repeat_ngram_size": 0,
-    "eos_token_id": 2,
-    "exponential_decay_length_penalty": null,
-    "finetuning_task": null,
-    "forced_bos_token_id": null,
-    "forced_eos_token_id": null,
-    "hidden_act": "gelu",
-    "hidden_size": 512,
-    "id2label": {
-      "0": "LABEL_0",
-      "1": "LABEL_1"
-    },
-    "initializer_factor": 1.0,
-    "initializer_range": 0.02,
-    "intermediate_size": 2048,
-    "is_decoder": false,
-    "is_encoder_decoder": false,
-    "k_bias": true,
-    "label2id": {
-      "LABEL_0": 0,
-      "LABEL_1": 1
-    },
-    "layer_norm_eps": 1e-05,
-    "length_penalty": 1.0,
-    "max_length": 20,
-    "max_position_embeddings": 77,
-    "min_length": 0,
-    "model_type": "clip_text_model",
-    "no_repeat_ngram_size": 0,
-    "num_attention_heads": 8,
-    "num_beam_groups": 1,
-    "num_beams": 1,
-    "num_hidden_layers": 12,
-    "num_return_sequences": 1,
-    "output_attentions": false,
-    "output_hidden_states": false,
-    "output_scores": false,
-    "pad_token_id": 1,
-    "post_layernorm": false,
-    "prefix": null,
-    "problem_type": null,
-    "projection_dim": 512,
-    "pruned_heads": {},
-    "q_bias": true,
-    "remove_invalid_values": false,
-    "repetition_penalty": 1.0,
-    "return_dict": true,
-    "return_dict_in_generate": false,
-    "sep_token_id": null,
-    "suppress_tokens": null,
-    "task_specific_params": null,
-    "temperature": 1.0,
-    "tf_legacy_loss": false,
-    "tie_encoder_decoder": false,
-    "tie_word_embeddings": true,
-    "tokenizer_class": null,
-    "top_k": 50,
-    "top_p": 1.0,
-    "torch_dtype": null,
-    "torchscript": false,
-    "transformers_version": "4.44.2",
-    "typical_p": 1.0,
-    "use_bfloat16": false,
-    "v_bias": true,
-    "vocab_size": 49408
-  },
   "torch_dtype": "float32",
   "transformers_version": null,
   "vision_config": {
@@ -100,7 +19,6 @@
     "architectures": null,
     "attention_dropout": 0.0,
     "bad_words_ids": null,
-    "begin_suppress_tokens": null,
     "bos_token_id": null,
     "chunk_size_feed_forward": 0,
     "cross_attention_hidden_size": null,
@@ -115,7 +33,7 @@
     "finetuning_task": null,
     "forced_bos_token_id": null,
     "forced_eos_token_id": null,
-    "hidden_act": "
+    "hidden_act": "quick_gelu",
     "hidden_size": 1024,
     "id2label": {
       "0": "LABEL_0",
@@ -127,7 +45,6 @@
     "intermediate_size": 4096,
     "is_decoder": false,
     "is_encoder_decoder": false,
-    "k_bias": true,
     "label2id": {
       "LABEL_0": 0,
       "LABEL_1": 1
@@ -149,18 +66,15 @@
     "output_scores": false,
     "pad_token_id": null,
     "patch_size": 14,
-    "post_layernorm": false,
     "prefix": null,
     "problem_type": null,
-    "projection_dim":
+    "projection_dim": 1280,
     "pruned_heads": {},
-    "q_bias": true,
     "remove_invalid_values": false,
     "repetition_penalty": 1.0,
     "return_dict": true,
     "return_dict_in_generate": false,
     "sep_token_id": null,
-    "suppress_tokens": null,
     "task_specific_params": null,
     "temperature": 1.0,
     "tf_legacy_loss": false,
@@ -171,9 +85,17 @@
     "top_p": 1.0,
     "torch_dtype": null,
     "torchscript": false,
-    "transformers_version": "4.
+    "transformers_version": "4.21.3",
     "typical_p": 1.0,
-    "use_bfloat16": false
-
+    "use_bfloat16": false
+  },
+  "vision_config_dict": {
+    "hidden_size": 1024,
+    "image_size": 336,
+    "intermediate_size": 4096,
+    "num_attention_heads": 16,
+    "num_hidden_layers": 24,
+    "patch_size": 14,
+    "projection_dim": 1280
   }
-}
+}
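A minimal sketch of how the updated config.json resolves, assuming the repo files are available under this model name (locally or on the Hub). `trust_remote_code=True` lets `AutoConfig` follow the `auto_map` entry to `configuration_clip.CLIPConfig` from this repo; the printed values are the ones visible in the diff above.

```python
from transformers import AutoConfig

config = AutoConfig.from_pretrained("LLM2CLIP-Openai-L-14-336", trust_remote_code=True)
print(config.architectures)    # expected: ["LLM2CLIPModel"]
print(config.projection_dim)   # expected: 1280
print(config.vision_config.hidden_size, config.vision_config.patch_size)  # expected: 1024 14
```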
configuration_clip.py CHANGED
@@ -26,9 +26,9 @@ if TYPE_CHECKING:
     from transformers.utils import TensorType
 
 from transformers.configuration_utils import PretrainedConfig
+from transformers.onnx import OnnxConfig
 from transformers.utils import logging
 
-
 logger = logging.get_logger(__name__)
 
 
@@ -50,25 +50,33 @@ class CLIPTextConfig(PretrainedConfig):
             Dimensionality of the encoder layers and the pooler layer.
         intermediate_size (`int`, *optional*, defaults to 2048):
             Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+        projection_dim (`int`, *optional*, defaults to 512):
+            Dimensionality of text and vision projection layers.
         num_hidden_layers (`int`, *optional*, defaults to 12):
             Number of hidden layers in the Transformer encoder.
         num_attention_heads (`int`, *optional*, defaults to 8):
             Number of attention heads for each attention layer in the Transformer encoder.
-        max_position_embeddings (`int`, *optional*, defaults to 77)
+        max_position_embeddings (`int`, *optional*, defaults to 77):
             The maximum sequence length that this model might ever be used with. Typically set this to something large
             just in case (e.g., 512 or 1024 or 2048).
         hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
             `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
-        layer_norm_eps (`float`, *optional*, defaults to 1e-
+        layer_norm_eps (`float`, *optional*, defaults to 1e-05):
             The epsilon used by the layer normalization layers.
         attention_dropout (`float`, *optional*, defaults to 0.0):
             The dropout ratio for the attention probabilities.
         initializer_range (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        initializer_factor (`float`, *optional*, defaults to 1):
+        initializer_factor (`float`, *optional*, defaults to 1.0):
             A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
             testing).
+        pad_token_id (`int`, *optional*, defaults to 1):
+            Padding token id.
+        bos_token_id (`int`, *optional*, defaults to 49406):
+            Beginning of stream token id.
+        eos_token_id (`int`, *optional*, defaults to 49407):
+            End of stream token id.
 
     Example:
 
@@ -84,6 +92,7 @@ class CLIPTextConfig(PretrainedConfig):
     >>> # Accessing the model configuration
     >>> configuration = model.config
     ```"""
+
     model_type = "clip_text_model"
 
     def __init__(
@@ -95,18 +104,16 @@
         num_hidden_layers=12,
         num_attention_heads=8,
         max_position_embeddings=77,
-        hidden_act="
+        hidden_act="quick_gelu",
         layer_norm_eps=1e-5,
         attention_dropout=0.0,
         initializer_range=0.02,
         initializer_factor=1.0,
-
-
-        v_bias=True,
-        post_layernorm=False,
+        # This differs from `CLIPTokenizer`'s default and from openai/clip
+        # See https://github.com/huggingface/transformers/pull/24773#issuecomment-1632287538
         pad_token_id=1,
-        bos_token_id=
-        eos_token_id=
+        bos_token_id=49406,
+        eos_token_id=49407,
         **kwargs,
     ):
         super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
@@ -122,14 +129,12 @@
         self.hidden_act = hidden_act
         self.initializer_range = initializer_range
         self.initializer_factor = initializer_factor
-        self.q_bias=q_bias
-        self.k_bias=k_bias
-        self.v_bias=v_bias
-        self.post_layernorm = post_layernorm
         self.attention_dropout = attention_dropout
 
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
+        cls._set_token_in_kwargs(kwargs)
+
         config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
 
         # get the text config dict if we are loading from CLIPConfig
@@ -160,24 +165,28 @@ class CLIPVisionConfig(PretrainedConfig):
             Dimensionality of the encoder layers and the pooler layer.
         intermediate_size (`int`, *optional*, defaults to 3072):
             Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+        projection_dim (`int`, *optional*, defaults to 512):
+            Dimensionality of text and vision projection layers.
         num_hidden_layers (`int`, *optional*, defaults to 12):
             Number of hidden layers in the Transformer encoder.
         num_attention_heads (`int`, *optional*, defaults to 12):
             Number of attention heads for each attention layer in the Transformer encoder.
+        num_channels (`int`, *optional*, defaults to 3):
+            The number of input channels.
         image_size (`int`, *optional*, defaults to 224):
             The size (resolution) of each image.
         patch_size (`int`, *optional*, defaults to 32):
             The size (resolution) of each patch.
         hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
-            `"relu"`, `"selu"` and `"gelu_new"`
-        layer_norm_eps (`float`, *optional*, defaults to 1e-
+            `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-05):
             The epsilon used by the layer normalization layers.
         attention_dropout (`float`, *optional*, defaults to 0.0):
             The dropout ratio for the attention probabilities.
         initializer_range (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        initializer_factor (`float`, *optional*, defaults to 1):
+        initializer_factor (`float`, *optional*, defaults to 1.0):
             A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
             testing).
 
@@ -208,15 +217,11 @@
         num_channels=3,
         image_size=224,
         patch_size=32,
-        hidden_act="
+        hidden_act="quick_gelu",
         layer_norm_eps=1e-5,
         attention_dropout=0.0,
         initializer_range=0.02,
         initializer_factor=1.0,
-        q_bias=True,
-        k_bias=True,
-        v_bias=True,
-        post_layernorm=False,
         **kwargs,
     ):
         super().__init__(**kwargs)
@@ -231,16 +236,14 @@
         self.image_size = image_size
         self.initializer_range = initializer_range
         self.initializer_factor = initializer_factor
-        self.q_bias=q_bias
-        self.k_bias=k_bias
-        self.v_bias=v_bias
-        self.post_layernorm = post_layernorm
         self.attention_dropout = attention_dropout
         self.layer_norm_eps = layer_norm_eps
         self.hidden_act = hidden_act
 
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
+        cls._set_token_in_kwargs(kwargs)
+
         config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
 
         # get the vision config dict if we are loading from CLIPConfig
@@ -272,9 +275,9 @@ class CLIPConfig(PretrainedConfig):
         vision_config (`dict`, *optional*):
             Dictionary of configuration options used to initialize [`CLIPVisionConfig`].
         projection_dim (`int`, *optional*, defaults to 512):
-
+            Dimensionality of text and vision projection layers.
         logit_scale_init_value (`float`, *optional*, defaults to 2.6592):
-            The
+            The initial value of the *logit_scale* parameter. Default is used as per the original CLIP implementation.
         kwargs (*optional*):
             Dictionary of keyword arguments.
 
@@ -303,7 +306,6 @@
     ```"""
 
     model_type = "clip"
-    is_composition = True
 
     def __init__(
         self, text_config=None, vision_config=None, projection_dim=512, logit_scale_init_value=2.6592, **kwargs
@@ -339,9 +341,9 @@
             else:
                 message = (
                     f"`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The "
-                    f'value `text_config["{key}"]` will be
+                    f'value `text_config["{key}"]` will be overridden.'
                 )
-                logger.
+                logger.info(message)
 
         # Update all values in `text_config` with the ones in `_text_config_dict`.
         text_config.update(_text_config_dict)
@@ -371,9 +373,9 @@
             else:
                 message = (
                     f"`vision_config_dict` is provided which will be used to initialize `CLIPVisionConfig`. "
-                    f'The value `vision_config["{key}"]` will be
+                    f'The value `vision_config["{key}"]` will be overridden.'
                 )
-                logger.
+                logger.info(message)
 
         # Update all values in `vision_config` with the ones in `_vision_config_dict`.
         vision_config.update(_vision_config_dict)
@@ -405,16 +407,48 @@
 
         return cls(text_config=text_config.to_dict(), vision_config=vision_config.to_dict(), **kwargs)
 
-    def to_dict(self):
-        """
-        Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`].
-
-        Returns:
-            `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance,
-        """
-        output = copy.deepcopy(self.__dict__)
-        output["text_config"] = self.text_config.to_dict()
-        output["vision_config"] = self.vision_config.to_dict()
-        output["model_type"] = self.__class__.model_type
-        return output
 
+class CLIPOnnxConfig(OnnxConfig):
+    @property
+    def inputs(self) -> Mapping[str, Mapping[int, str]]:
+        return OrderedDict(
+            [
+                ("input_ids", {0: "batch", 1: "sequence"}),
+                ("pixel_values", {0: "batch", 1: "num_channels", 2: "height", 3: "width"}),
+                ("attention_mask", {0: "batch", 1: "sequence"}),
+            ]
+        )
+
+    @property
+    def outputs(self) -> Mapping[str, Mapping[int, str]]:
+        return OrderedDict(
+            [
+                ("logits_per_image", {0: "batch"}),
+                ("logits_per_text", {0: "batch"}),
+                ("text_embeds", {0: "batch"}),
+                ("image_embeds", {0: "batch"}),
+            ]
+        )
+
+    @property
+    def atol_for_validation(self) -> float:
+        return 1e-4
+
+    def generate_dummy_inputs(
+        self,
+        processor: "ProcessorMixin",
+        batch_size: int = -1,
+        seq_length: int = -1,
+        framework: Optional["TensorType"] = None,
+    ) -> Mapping[str, Any]:
+        text_input_dict = super().generate_dummy_inputs(
+            processor.tokenizer, batch_size=batch_size, seq_length=seq_length, framework=framework
+        )
+        image_input_dict = super().generate_dummy_inputs(
+            processor.image_processor, batch_size=batch_size, framework=framework
+        )
+        return {**text_input_dict, **image_input_dict}
+
+    @property
+    def default_onnx_opset(self) -> int:
+        return 14
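A quick sanity check of the `CLIPOnnxConfig` class added in the last hunk: build it from a default `CLIPConfig` and inspect the ONNX input/output axes it declares. This is a sketch that assumes this repo's configuration_clip.py is importable (e.g. run from the repo root with `transformers` installed); the expected values come straight from the code above.

```python
# Assumes configuration_clip.py (this repo's copy) is on the import path.
from configuration_clip import CLIPConfig, CLIPOnnxConfig

onnx_config = CLIPOnnxConfig(CLIPConfig())
print(dict(onnx_config.inputs))    # input_ids, pixel_values, attention_mask with dynamic axes
print(dict(onnx_config.outputs))   # logits_per_image, logits_per_text, text_embeds, image_embeds
print(onnx_config.default_onnx_opset, onnx_config.atol_for_validation)  # 14 0.0001
```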
pytorch_model.bin → model.safetensors RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:b735de584f3270fe5a818cba1724bf387d90e14ecf703fb5ac2829a16c711961
+size 2314403228
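The new LFS pointer pins model.safetensors to a specific sha256 digest and byte size, so a local download can be verified against the commit. A minimal check using only the values shown above:

```python
# Verify a downloaded model.safetensors against the LFS pointer in this commit.
import hashlib
import os

path = "model.safetensors"
digest = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        digest.update(chunk)

print(digest.hexdigest() == "b735de584f3270fe5a818cba1724bf387d90e14ecf703fb5ac2829a16c711961")
print(os.path.getsize(path) == 2314403228)
```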