Upload folder using huggingface_hub
Browse files- model.safetensors +2 -2
- processing_maira2.py +15 -20
- tokenizer.model +2 -2
model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d3bfb1d6f0ec0f0949cd84df187a2bfb571242c4ca9bdd519c4af512716ae23a
|
| 3 |
+
size 4240896
|
processing_maira2.py
CHANGED
|
@@ -55,9 +55,9 @@ class Maira2Processor(LlavaProcessor):
|
|
| 55 |
self,
|
| 56 |
image_processor: BaseImageProcessor = None,
|
| 57 |
tokenizer: PreTrainedTokenizer = None,
|
| 58 |
-
patch_size=None,
|
| 59 |
-
vision_feature_select_strategy=None,
|
| 60 |
-
chat_template=None,
|
| 61 |
image_token: str = "<image>",
|
| 62 |
phrase_start_token: str = "<obj>",
|
| 63 |
phrase_end_token: str = "</obj>",
|
|
@@ -301,12 +301,12 @@ class Maira2Processor(LlavaProcessor):
|
|
| 301 |
)
|
| 302 |
messages = [{"content": prompt, "role": "user"}]
|
| 303 |
if assistant_text is not None:
|
| 304 |
-
messages.append(
|
| 305 |
-
{"content": [{"index": None, "text": assistant_text, "type": "text"}], "role": "assistant"}
|
| 306 |
-
)
|
| 307 |
return messages
|
| 308 |
|
| 309 |
-
def _construct_chat_messages_phrase_grounding(
|
|
|
|
|
|
|
| 310 |
"""
|
| 311 |
This function constructs the chat messages for phrase grounding used in the phrase grounding task.
|
| 312 |
|
|
@@ -331,9 +331,7 @@ class Maira2Processor(LlavaProcessor):
|
|
| 331 |
]
|
| 332 |
messages = [{"content": prompt, "role": "user"}]
|
| 333 |
if assistant_text is not None:
|
| 334 |
-
messages.append(
|
| 335 |
-
{"content": [{"index": None, "text": assistant_text, "type": "text"}], "role": "assistant"}
|
| 336 |
-
)
|
| 337 |
return messages
|
| 338 |
|
| 339 |
def format_reporting_input(
|
|
@@ -390,9 +388,7 @@ class Maira2Processor(LlavaProcessor):
|
|
| 390 |
assistant_text=assistant_text,
|
| 391 |
)
|
| 392 |
add_generation_prompt = assistant_text is None
|
| 393 |
-
text = self.tokenizer.apply_chat_template(
|
| 394 |
-
messages, add_generation_prompt=add_generation_prompt, tokenize=False
|
| 395 |
-
)
|
| 396 |
return text, images
|
| 397 |
|
| 398 |
def format_phrase_grounding_input(
|
|
@@ -423,9 +419,7 @@ class Maira2Processor(LlavaProcessor):
|
|
| 423 |
)
|
| 424 |
messages = self._construct_chat_messages_phrase_grounding(phrase)
|
| 425 |
add_generation_prompt = assistant_text is None
|
| 426 |
-
text = self.tokenizer.apply_chat_template(
|
| 427 |
-
messages, add_generation_prompt=add_generation_prompt, tokenize=False
|
| 428 |
-
)
|
| 429 |
return text, images
|
| 430 |
|
| 431 |
def format_and_preprocess_reporting_input(
|
|
@@ -548,7 +542,9 @@ class Maira2Processor(LlavaProcessor):
|
|
| 548 |
assert len(text) == 0
|
| 549 |
return split_text
|
| 550 |
|
| 551 |
-
def convert_output_to_plaintext_or_grounded_sequence(
|
|
|
|
|
|
|
| 552 |
"""
|
| 553 |
This function converts the input text to a grounded sequence by extracting the grounded phrases and bounding
|
| 554 |
boxes from the text. If the text is plaintext without any grounded phrases, it returns the text as is.
|
|
@@ -703,7 +699,6 @@ class Maira2Processor(LlavaProcessor):
|
|
| 703 |
tokenizer_init_kwargs=self.tokenizer.init_kwargs,
|
| 704 |
**kwargs,
|
| 705 |
)
|
| 706 |
-
|
| 707 |
if images is not None:
|
| 708 |
image_inputs = self.image_processor(images, **output_kwargs["images_kwargs"])
|
| 709 |
else:
|
|
@@ -730,6 +725,6 @@ class Maira2Processor(LlavaProcessor):
|
|
| 730 |
sample = sample.replace(self.image_token, self.image_token * num_image_tokens)
|
| 731 |
prompt_strings.append(sample)
|
| 732 |
|
| 733 |
-
output_kwargs.pop("return_mm_token_type_ids")
|
| 734 |
text_inputs = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"])
|
| 735 |
-
return BatchFeature(data={**text_inputs, **image_inputs})
|
|
|
|
| 55 |
self,
|
| 56 |
image_processor: BaseImageProcessor = None,
|
| 57 |
tokenizer: PreTrainedTokenizer = None,
|
| 58 |
+
patch_size = None,
|
| 59 |
+
vision_feature_select_strategy = None,
|
| 60 |
+
chat_template = None,
|
| 61 |
image_token: str = "<image>",
|
| 62 |
phrase_start_token: str = "<obj>",
|
| 63 |
phrase_end_token: str = "</obj>",
|
|
|
|
| 301 |
)
|
| 302 |
messages = [{"content": prompt, "role": "user"}]
|
| 303 |
if assistant_text is not None:
|
| 304 |
+
messages.append({"content": [{"index": None, "text": assistant_text, "type": "text"}], "role": "assistant"})
|
|
|
|
|
|
|
| 305 |
return messages
|
| 306 |
|
| 307 |
+
def _construct_chat_messages_phrase_grounding(
|
| 308 |
+
self, phrase: str, assistant_text: str = None
|
| 309 |
+
):
|
| 310 |
"""
|
| 311 |
This function constructs the chat messages for phrase grounding used in the phrase grounding task.
|
| 312 |
|
|
|
|
| 331 |
]
|
| 332 |
messages = [{"content": prompt, "role": "user"}]
|
| 333 |
if assistant_text is not None:
|
| 334 |
+
messages.append({"content": [{"index": None, "text": assistant_text, "type": "text"}], "role": "assistant"})
|
|
|
|
|
|
|
| 335 |
return messages
|
| 336 |
|
| 337 |
def format_reporting_input(
|
|
|
|
| 388 |
assistant_text=assistant_text,
|
| 389 |
)
|
| 390 |
add_generation_prompt = assistant_text is None
|
| 391 |
+
text = self.tokenizer.apply_chat_template(messages, add_generation_prompt=add_generation_prompt, tokenize=False)
|
|
|
|
|
|
|
| 392 |
return text, images
|
| 393 |
|
| 394 |
def format_phrase_grounding_input(
|
|
|
|
| 419 |
)
|
| 420 |
messages = self._construct_chat_messages_phrase_grounding(phrase)
|
| 421 |
add_generation_prompt = assistant_text is None
|
| 422 |
+
text = self.tokenizer.apply_chat_template(messages, add_generation_prompt=add_generation_prompt, tokenize=False)
|
|
|
|
|
|
|
| 423 |
return text, images
|
| 424 |
|
| 425 |
def format_and_preprocess_reporting_input(
|
|
|
|
| 542 |
assert len(text) == 0
|
| 543 |
return split_text
|
| 544 |
|
| 545 |
+
def convert_output_to_plaintext_or_grounded_sequence(
|
| 546 |
+
self, text: str
|
| 547 |
+
):
|
| 548 |
"""
|
| 549 |
This function converts the input text to a grounded sequence by extracting the grounded phrases and bounding
|
| 550 |
boxes from the text. If the text is plaintext without any grounded phrases, it returns the text as is.
|
|
|
|
| 699 |
tokenizer_init_kwargs=self.tokenizer.init_kwargs,
|
| 700 |
**kwargs,
|
| 701 |
)
|
|
|
|
| 702 |
if images is not None:
|
| 703 |
image_inputs = self.image_processor(images, **output_kwargs["images_kwargs"])
|
| 704 |
else:
|
|
|
|
| 725 |
sample = sample.replace(self.image_token, self.image_token * num_image_tokens)
|
| 726 |
prompt_strings.append(sample)
|
| 727 |
|
| 728 |
+
output_kwargs.pop("return_mm_token_type_ids", None)
|
| 729 |
text_inputs = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"])
|
| 730 |
+
return BatchFeature(data={**text_inputs, **image_inputs})
|
tokenizer.model
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
|
| 3 |
+
size 499723
|