Upload folder using huggingface_hub

Browse files

Files changed (3) hide show

model.safetensors +2 -2
processing_maira2.py +15 -20
tokenizer.model +2 -2

model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:0a655577b6a338c3cef1d9cd004220624b5d2ec80f3756b00059e5d1b32e8355
-size 132
+oid sha256:d3bfb1d6f0ec0f0949cd84df187a2bfb571242c4ca9bdd519c4af512716ae23a
+size 4240896

processing_maira2.py CHANGED Viewed

@@ -55,9 +55,9 @@ class Maira2Processor(LlavaProcessor):
         self,
         image_processor: BaseImageProcessor = None,
         tokenizer: PreTrainedTokenizer = None,
-        patch_size=None,
-        vision_feature_select_strategy=None,
-        chat_template=None,
         image_token: str = "<image>",
         phrase_start_token: str = "<obj>",
         phrase_end_token: str = "</obj>",
@@ -301,12 +301,12 @@ class Maira2Processor(LlavaProcessor):
             )
         messages = [{"content": prompt, "role": "user"}]
         if assistant_text is not None:
-            messages.append(
-                {"content": [{"index": None, "text": assistant_text, "type": "text"}], "role": "assistant"}
-            )
         return messages
-    def _construct_chat_messages_phrase_grounding(self, phrase: str, assistant_text: str = None):
         """
         This function constructs the chat messages for phrase grounding used in the phrase grounding task.
@@ -331,9 +331,7 @@ class Maira2Processor(LlavaProcessor):
         ]
         messages = [{"content": prompt, "role": "user"}]
         if assistant_text is not None:
-            messages.append(
-                {"content": [{"index": None, "text": assistant_text, "type": "text"}], "role": "assistant"}
-            )
         return messages
     def format_reporting_input(
@@ -390,9 +388,7 @@ class Maira2Processor(LlavaProcessor):
             assistant_text=assistant_text,
         )
         add_generation_prompt = assistant_text is None
-        text = self.tokenizer.apply_chat_template(
-            messages, add_generation_prompt=add_generation_prompt, tokenize=False
-        )
         return text, images
     def format_phrase_grounding_input(
@@ -423,9 +419,7 @@ class Maira2Processor(LlavaProcessor):
         )
         messages = self._construct_chat_messages_phrase_grounding(phrase)
         add_generation_prompt = assistant_text is None
-        text = self.tokenizer.apply_chat_template(
-            messages, add_generation_prompt=add_generation_prompt, tokenize=False
-        )
         return text, images
     def format_and_preprocess_reporting_input(
@@ -548,7 +542,9 @@ class Maira2Processor(LlavaProcessor):
         assert len(text) == 0
         return split_text
-    def convert_output_to_plaintext_or_grounded_sequence(self, text: str):
         """
         This function converts the input text to a grounded sequence by extracting the grounded phrases and bounding
         boxes from the text. If the text is plaintext without any grounded phrases, it returns the text as is.
@@ -703,7 +699,6 @@ class Maira2Processor(LlavaProcessor):
             tokenizer_init_kwargs=self.tokenizer.init_kwargs,
             **kwargs,
         )
         if images is not None:
             image_inputs = self.image_processor(images, **output_kwargs["images_kwargs"])
         else:
@@ -730,6 +725,6 @@ class Maira2Processor(LlavaProcessor):
                     sample = sample.replace(self.image_token, self.image_token * num_image_tokens)
                     prompt_strings.append(sample)
-        output_kwargs.pop("return_mm_token_type_ids")
         text_inputs = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"])
-        return BatchFeature(data={**text_inputs, **image_inputs})

         self,
         image_processor: BaseImageProcessor = None,
         tokenizer: PreTrainedTokenizer = None,
+        patch_size = None,
+        vision_feature_select_strategy = None,
+        chat_template = None,
         image_token: str = "<image>",
         phrase_start_token: str = "<obj>",
         phrase_end_token: str = "</obj>",
             )
         messages = [{"content": prompt, "role": "user"}]
         if assistant_text is not None:
+            messages.append({"content": [{"index": None, "text": assistant_text, "type": "text"}], "role": "assistant"})
         return messages
+    def _construct_chat_messages_phrase_grounding(
+        self, phrase: str, assistant_text: str = None
+    ):
         """
         This function constructs the chat messages for phrase grounding used in the phrase grounding task.
         ]
         messages = [{"content": prompt, "role": "user"}]
         if assistant_text is not None:
+            messages.append({"content": [{"index": None, "text": assistant_text, "type": "text"}], "role": "assistant"})
         return messages
     def format_reporting_input(
             assistant_text=assistant_text,
         )
         add_generation_prompt = assistant_text is None
+        text = self.tokenizer.apply_chat_template(messages, add_generation_prompt=add_generation_prompt, tokenize=False)
         return text, images
     def format_phrase_grounding_input(
         )
         messages = self._construct_chat_messages_phrase_grounding(phrase)
         add_generation_prompt = assistant_text is None
+        text = self.tokenizer.apply_chat_template(messages, add_generation_prompt=add_generation_prompt, tokenize=False)
         return text, images
     def format_and_preprocess_reporting_input(
         assert len(text) == 0
         return split_text
+    def convert_output_to_plaintext_or_grounded_sequence(
+        self, text: str
+    ):
         """
         This function converts the input text to a grounded sequence by extracting the grounded phrases and bounding
         boxes from the text. If the text is plaintext without any grounded phrases, it returns the text as is.
             tokenizer_init_kwargs=self.tokenizer.init_kwargs,
             **kwargs,
         )
         if images is not None:
             image_inputs = self.image_processor(images, **output_kwargs["images_kwargs"])
         else:
                     sample = sample.replace(self.image_token, self.image_token * num_image_tokens)
                     prompt_strings.append(sample)
+        output_kwargs.pop("return_mm_token_type_ids", None)
         text_inputs = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"])
+        return BatchFeature(data={**text_inputs, **image_inputs})

tokenizer.model CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:1a8f238a200be6c23fbba0f9a999ab4fe3c09ca303b29805e68cf6659bfb7d89
-size 131
+oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+size 499723