optimum-internal-testing
/

tiny-random-maira2

Safetensors

maira2

custom_code

Model card Files Files and versions

xet

Community

IlyasMoutawwakil HF Staff committed on Jul 11

Commit

16fcb08

verified ·

1 Parent(s): 1dd02e1

Upload folder using huggingface_hub

Browse files

Files changed (1) hide show

processing_maira2.py +23 -18

processing_maira2.py CHANGED Viewed

@@ -3,17 +3,18 @@
 import re
-from typing import Any, Union, List
 import numpy as np
 from PIL import Image
 from transformers import BaseImageProcessor, LlavaProcessor, PreTrainedTokenizer
-from transformers.models.llava.processing_llava import LlavaProcessorKwargs
 from transformers.feature_extraction_utils import BatchFeature
 from transformers.image_utils import ImageInput, get_image_size, to_numpy_array
-from transformers.processing_utils import ProcessingKwargs, ProcessorMixin, Unpack, _validate_images_text_input_order
 from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
 # SingleChatMessageType: TypeAlias = dict[str, str | int | None]
 # ChatMessageListType: TypeAlias = list[dict[str, str | list[SingleChatMessageType]]]
 # BoxType: TypeAlias = tuple[float, float, float, float]
@@ -55,9 +56,9 @@ class Maira2Processor(LlavaProcessor):
         self,
         image_processor: BaseImageProcessor = None,
         tokenizer: PreTrainedTokenizer = None,
-        patch_size = None,
-        vision_feature_select_strategy = None,
-        chat_template = None,
         image_token: str = "<image>",
         phrase_start_token: str = "<obj>",
         phrase_end_token: str = "</obj>",
@@ -301,12 +302,12 @@ class Maira2Processor(LlavaProcessor):
             )
         messages = [{"content": prompt, "role": "user"}]
         if assistant_text is not None:
-            messages.append({"content": [{"index": None, "text": assistant_text, "type": "text"}], "role": "assistant"})
         return messages
-    def _construct_chat_messages_phrase_grounding(
-        self, phrase: str, assistant_text: str = None
-    ):
         """
         This function constructs the chat messages for phrase grounding used in the phrase grounding task.
@@ -331,7 +332,9 @@ class Maira2Processor(LlavaProcessor):
         ]
         messages = [{"content": prompt, "role": "user"}]
         if assistant_text is not None:
-            messages.append({"content": [{"index": None, "text": assistant_text, "type": "text"}], "role": "assistant"})
         return messages
     def format_reporting_input(
@@ -388,7 +391,9 @@ class Maira2Processor(LlavaProcessor):
             assistant_text=assistant_text,
         )
         add_generation_prompt = assistant_text is None
-        text = self.tokenizer.apply_chat_template(messages, add_generation_prompt=add_generation_prompt, tokenize=False)
         return text, images
     def format_phrase_grounding_input(
@@ -419,7 +424,9 @@ class Maira2Processor(LlavaProcessor):
         )
         messages = self._construct_chat_messages_phrase_grounding(phrase)
         add_generation_prompt = assistant_text is None
-        text = self.tokenizer.apply_chat_template(messages, add_generation_prompt=add_generation_prompt, tokenize=False)
         return text, images
     def format_and_preprocess_reporting_input(
@@ -542,9 +549,7 @@ class Maira2Processor(LlavaProcessor):
         assert len(text) == 0
         return split_text
-    def convert_output_to_plaintext_or_grounded_sequence(
-        self, text: str
-    ):
         """
         This function converts the input text to a grounded sequence by extracting the grounded phrases and bounding
         boxes from the text. If the text is plaintext without any grounded phrases, it returns the text as is.
@@ -725,6 +730,6 @@ class Maira2Processor(LlavaProcessor):
                     sample = sample.replace(self.image_token, self.image_token * num_image_tokens)
                     prompt_strings.append(sample)
-        output_kwargs.pop("return_mm_token_type_ids", None)
         text_inputs = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"])
-        return BatchFeature(data={**text_inputs, **image_inputs})

 import re
+from typing import Any, List, Union
 import numpy as np
 from PIL import Image
 from transformers import BaseImageProcessor, LlavaProcessor, PreTrainedTokenizer
 from transformers.feature_extraction_utils import BatchFeature
 from transformers.image_utils import ImageInput, get_image_size, to_numpy_array
+from transformers.models.llava.processing_llava import LlavaProcessorKwargs
+from transformers.processing_utils import Unpack, _validate_images_text_input_order
 from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
 # SingleChatMessageType: TypeAlias = dict[str, str | int | None]
 # ChatMessageListType: TypeAlias = list[dict[str, str | list[SingleChatMessageType]]]
 # BoxType: TypeAlias = tuple[float, float, float, float]
         self,
         image_processor: BaseImageProcessor = None,
         tokenizer: PreTrainedTokenizer = None,
+        patch_size=None,
+        vision_feature_select_strategy=None,
+        chat_template=None,
         image_token: str = "<image>",
         phrase_start_token: str = "<obj>",
         phrase_end_token: str = "</obj>",
             )
         messages = [{"content": prompt, "role": "user"}]
         if assistant_text is not None:
+            messages.append(
+                {"content": [{"index": None, "text": assistant_text, "type": "text"}], "role": "assistant"}
+            )
         return messages
+    def _construct_chat_messages_phrase_grounding(self, phrase: str, assistant_text: str = None):
         """
         This function constructs the chat messages for phrase grounding used in the phrase grounding task.
         ]
         messages = [{"content": prompt, "role": "user"}]
         if assistant_text is not None:
+            messages.append(
+                {"content": [{"index": None, "text": assistant_text, "type": "text"}], "role": "assistant"}
+            )
         return messages
     def format_reporting_input(
             assistant_text=assistant_text,
         )
         add_generation_prompt = assistant_text is None
+        text = self.tokenizer.apply_chat_template(
+            messages, add_generation_prompt=add_generation_prompt, tokenize=False
+        )
         return text, images
     def format_phrase_grounding_input(
         )
         messages = self._construct_chat_messages_phrase_grounding(phrase)
         add_generation_prompt = assistant_text is None
+        text = self.tokenizer.apply_chat_template(
+            messages, add_generation_prompt=add_generation_prompt, tokenize=False
+        )
         return text, images
     def format_and_preprocess_reporting_input(
         assert len(text) == 0
         return split_text
+    def convert_output_to_plaintext_or_grounded_sequence(self, text: str):
         """
         This function converts the input text to a grounded sequence by extracting the grounded phrases and bounding
         boxes from the text. If the text is plaintext without any grounded phrases, it returns the text as is.
                     sample = sample.replace(self.image_token, self.image_token * num_image_tokens)
                     prompt_strings.append(sample)
+        output_kwargs["text_kwargs"].pop("return_mm_token_type_ids", None)
         text_inputs = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"])
+        return BatchFeature(data={**text_inputs, **image_inputs})