Upload folder using huggingface_hub

Files changed:
- .hfd/aria2c_urls.txt +0 -0
- .hfd/last_download_command +1 -0
- .hfd/repo_metadata.json +1 -0
- configuration_r.py +0 -3
- modeling_r.py +2 -84
- processing_r.py +4 -31

.hfd/aria2c_urls.txt
ADDED
File without changes
.hfd/last_download_command
ADDED
@@ -0,0 +1 @@
+REPO_ID=YannQi/R-4B TOOL=aria2c INCLUDE_PATTERNS= EXCLUDE_PATTERNS= DATASET=0 HF_USERNAME= HF_TOKEN= HF_TOKEN=https://huggingface.co REVISION=main

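The recorded command captures the parameters of the hfd download that produced this folder: repo YannQi/R-4B at revision main, fetched with aria2c. For reference, a rough huggingface_hub equivalent of that download, as a sketch only (hfd itself drives aria2c; the `local_dir` name here is illustrative):

```python
from huggingface_hub import snapshot_download

# Sketch of an equivalent download using huggingface_hub instead of hfd/aria2c.
# local_dir is illustrative; hfd keeps its bookkeeping under .hfd/ in the target folder.
snapshot_download(
    repo_id="YannQi/R-4B",
    revision="main",
    local_dir="R-4B",
)
```
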
.hfd/repo_metadata.json
ADDED
@@ -0,0 +1 @@
+{"_id":"6899c7b833b8a4a0398a0ed2","id":"YannQi/R-4B","private":false,"pipeline_tag":"visual-question-answering","tags":["safetensors","R","visual-question-answering","custom_code","en","base_model:Qwen/Qwen3-4B","base_model:finetune:Qwen/Qwen3-4B","license:apache-2.0","region:us"],"downloads":0,"likes":3,"modelId":"YannQi/R-4B","author":"YannQi","sha":"9fcd58d9d7b03add99ea92df619b24fa60a0e1ac","lastModified":"2025-08-11T11:55:23.000Z","gated":false,"disabled":false,"model-index":null,"config":{"auto_map":{"AutoConfig":"configuration_r.RConfig","AutoModel":"modeling_r.RForConditionalGeneration","AutoModelForCausalLM":"modeling_r.RForConditionalGeneration"},"architectures":["RForConditionalGeneration"],"model_type":"R","tokenizer_config":{"bos_token":null,"eos_token":"<|im_end|>","pad_token":"<|endoftext|>","unk_token":null},"chat_template_jinja":"{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n'}}{# Render all images first #}{% for content in message['content'] | selectattr('type', 'equalto', 'image') %}{{ '<image>\n' }}{% endfor %}{# Render all video then #}{% for content in message['content'] | selectattr('type', 'equalto', 'video') %}{{ '<video>\n' }}{% endfor %}{# Render all text next #}{% if message['role'] != 'assistant' %}{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}{{ content['text'] }}{% endfor %}{% else %}{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}{% generation %}{{ content['text'] }}{% endgeneration %}{% endfor %}{% endif %}{{'<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n<think>' }}{% endif %}{%- if add_generation_prompt %}{%- if thinking_mode is defined and thinking_mode == 'short' %}{{- '\n\n</think>\n\n' }}{%- endif %}{%- if thinking_mode is defined and thinking_mode == 'long' %}{{- '\n' }}{%- endif %}{%- endif %}\n"},"cardData":{"license":"apache-2.0","language":["en"],"base_model":["Qwen/Qwen3-4B"],"pipeline_tag":"visual-question-answering"},"siblings":[{"rfilename":".gitattributes"},{"rfilename":"README.md"},{"rfilename":"added_tokens.json"},{"rfilename":"asset/R-4B.png"},{"rfilename":"asset/performance.png"},{"rfilename":"chat_template.jinja"},{"rfilename":"config.json"},{"rfilename":"configuration_r.py"},{"rfilename":"generation_config.json"},{"rfilename":"image_processing_r.py"},{"rfilename":"image_processing_r_fast.py"},{"rfilename":"merges.txt"},{"rfilename":"model-00001-of-00003.safetensors"},{"rfilename":"model-00002-of-00003.safetensors"},{"rfilename":"model-00003-of-00003.safetensors"},{"rfilename":"model.safetensors.index.json"},{"rfilename":"modeling_r.py"},{"rfilename":"preprocessor_config.json"},{"rfilename":"processing_r.py"},{"rfilename":"processor_config.json"},{"rfilename":"special_tokens_map.json"},{"rfilename":"tokenizer.json"},{"rfilename":"tokenizer_config.json"},{"rfilename":"video_preprocessor_config.json"},{"rfilename":"vocab.json"}],"spaces":[],"createdAt":"2025-08-11T10:36:40.000Z","safetensors":{"parameters":{"BF16":4819012384},"total":4819012384},"usedStorage":9653302738}

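The chat_template_jinja recorded above renders `<image>` placeholders, opens a `<think>` block on the generation prompt, and reads an optional `thinking_mode` variable: `'short'` closes the block immediately, `'long'` leaves it open for extended reasoning. A minimal sketch of rendering it, assuming the processor forwards extra keyword arguments such as `thinking_mode` to the Jinja template:

```python
from transformers import AutoProcessor

# Load the processor shipped with the repo (custom code).
processor = AutoProcessor.from_pretrained("YannQi/R-4B", trust_remote_code=True)

messages = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": "What is in this picture?"},
        ],
    }
]

# thinking_mode == "short" appends "\n\n</think>\n\n" so the model answers directly;
# thinking_mode == "long" leaves the <think> block open for extended reasoning.
prompt = processor.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=False,
    thinking_mode="long",  # assumption: extra kwargs reach the template
)
print(prompt)
```
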
configuration_r.py
CHANGED
@@ -27,7 +27,6 @@ class RConfig(PretrainedConfig):
     model_type = "R"
     attribute_map = {
         "image_token_id": "image_token_index",
-        "video_token_id": "video_token_index",
     }
     sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig}
 
@@ -36,7 +35,6 @@ class RConfig(PretrainedConfig):
         vision_config=None,
         text_config=None,
         image_token_index=151646,
-        video_token_index=151647,
         projector_hidden_act="gelu",
         vision_feature_select_strategy="full",
         vision_feature_layer=-1,
@@ -48,7 +46,6 @@ class RConfig(PretrainedConfig):
         **kwargs,
     ):
         self.image_token_index = image_token_index
-        self.video_token_index = video_token_index
         self.projector_hidden_act = projector_hidden_act
         self.multimodal_projector_bias = multimodal_projector_bias
 

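This change drops the video token from RConfig, leaving only the image placeholder. A small hypothetical check of the resulting config, loading the custom code shipped in the repo:

```python
from transformers import AutoConfig

# Hypothetical sanity check after the change: only the image token remains configured.
config = AutoConfig.from_pretrained("YannQi/R-4B", trust_remote_code=True)
print(config.model_type)                     # "R"
print(config.image_token_index)              # 151646 (default in configuration_r.py)
print(hasattr(config, "video_token_index"))  # expected: False
```
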
modeling_r.py
CHANGED
@@ -44,8 +44,6 @@ class RModelOutputWithPast(BaseModelOutputWithPast):
 
     image_hidden_states: Optional[torch.FloatTensor] = None
 
-    video_hidden_states: Optional[torch.FloatTensor] = None
-
 
 @dataclass
 class RCausalLMOutputWithPast(ModelOutput):
@@ -57,8 +55,6 @@ class RCausalLMOutputWithPast(ModelOutput):
     attentions: Optional[tuple[torch.FloatTensor]] = None
     image_hidden_states: Optional[torch.FloatTensor] = None
 
-    video_hidden_states: Optional[torch.FloatTensor] = None
-
 
 class RPooler(nn.Module):
     def __init__(self, config):
@@ -364,8 +360,6 @@ class RModel(RPreTrainedModel):
         input_ids: torch.LongTensor = None,
         pixel_values: torch.FloatTensor = None,
         image_sizes: Optional[torch.LongTensor] = None,
-        pixel_values_videos: torch.FloatTensor = None,
-        image_sizes_videos: Optional[torch.LongTensor] = None,
         attention_mask: Optional[torch.Tensor] = None,
         position_ids: Optional[torch.LongTensor] = None,
         past_key_values: Optional[list[torch.FloatTensor]] = None,
@@ -403,9 +397,9 @@ class RModel(RPreTrainedModel):
         if (input_ids is None) ^ (inputs_embeds is not None):
             raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
 
-        if (pixel_values is not None or pixel_values_videos is not None) and inputs_embeds is not None:
+        if pixel_values is not None and inputs_embeds is not None:
             raise ValueError(
-                "You cannot specify both `pixel_values`/`pixel_values_videos` and `inputs_embeds` at the same time, "
+                "You cannot specify both `pixel_values` and `inputs_embeds` at the same time, "
                 "and must specify either one"
             )
         if inputs_embeds is None:
@@ -434,30 +428,6 @@ class RModel(RPreTrainedModel):
             image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
             inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)
 
-        # Video are simply embedded and further pooled to decrease seq len
-        if pixel_values_videos is not None:
-            video_features = self.get_video_features(
-                pixel_values_videos,
-                vision_feature_layer=vision_feature_layer,
-                vision_feature_select_strategy=vision_feature_select_strategy,
-            )
-            image_newline = (
-                self.image_newline[None, None, :].repeat(video_features.shape[0], 1, 1).to(video_features.device)
-            )
-            video_features = torch.cat((video_features, image_newline), dim=1)
-            video_features = video_features.flatten(0, 1)
-
-            special_video_mask = (input_ids == self.config.video_token_id).unsqueeze(-1)
-            special_video_mask = special_video_mask.expand_as(inputs_embeds).to(inputs_embeds.device)
-            if not is_torchdynamo_compiling() and inputs_embeds[special_video_mask].numel() != video_features.numel():
-                n_video_tokens = (input_ids == self.config.video_token_id).sum()
-                n_video_features = video_features.shape[0]
-                raise ValueError(
-                    f"Video features and video tokens do not match: tokens: {n_video_tokens}, features {n_video_features}"
-                )
-            video_features = video_features.to(inputs_embeds.device, inputs_embeds.dtype)
-            inputs_embeds = inputs_embeds.masked_scatter(special_video_mask, video_features)
-
         outputs = self.language_model(
             attention_mask=attention_mask,
             position_ids=position_ids,
@@ -477,7 +447,6 @@ class RModel(RPreTrainedModel):
             hidden_states=outputs.hidden_states,
             attentions=outputs.attentions,
             image_hidden_states=image_features if pixel_values is not None else None,
-            video_hidden_states=video_features if pixel_values_videos is not None else None,
         )
 
     def apply_pooling(self, image_features):
@@ -494,36 +463,6 @@ class RModel(RPreTrainedModel):
         image_features = image_features.view(batch_frames, -1, dim)
         return image_features
 
-    def get_video_features(
-        self,
-        pixel_values: torch.FloatTensor,
-        vision_feature_layer: Union[int, list[int]],
-        vision_feature_select_strategy: str,
-    ):
-        batch_size, frames, channels, height, width = pixel_values.shape
-        pixel_values = pixel_values.view(batch_size * frames, channels, height, width)
-        video_features = self.vision_tower(pixel_values, output_hidden_states=True)
-
-        # If we have one vision feature layer, return the corresponding hidden states,
-        # otherwise, select the hidden states of each feature layer and concatenate them
-        if isinstance(vision_feature_layer, int):
-            selected_video_feature = video_features.hidden_states[vision_feature_layer]
-        else:
-            hs_pool = [video_features.hidden_states[layer_idx] for layer_idx in vision_feature_layer]
-            selected_video_feature = torch.cat(hs_pool, dim=-1)
-
-        if vision_feature_select_strategy == "default":
-            selected_video_feature = selected_video_feature[:, 1:]
-        elif vision_feature_select_strategy == "full":
-            selected_video_feature = selected_video_feature
-        video_features = self.multi_modal_projector(selected_video_feature)
-
-        video_features = self.apply_pooling(video_features)
-        video_features = video_features.reshape(batch_size, frames * video_features.shape[1], -1)
-
-        return video_features
-
-
 class RForConditionalGeneration(RPreTrainedModel, GenerationMixin):
     _checkpoint_conversion_mapping = {
         "^language_model.model": "model.language_model",
@@ -599,8 +538,6 @@ class RForConditionalGeneration(RPreTrainedModel, GenerationMixin):
         input_ids: torch.LongTensor = None,
         pixel_values: torch.FloatTensor = None,
         image_sizes: Optional[torch.LongTensor] = None,
-        pixel_values_videos: torch.FloatTensor = None,
-        image_sizes_videos: Optional[torch.LongTensor] = None,
         attention_mask: Optional[torch.Tensor] = None,
         position_ids: Optional[torch.LongTensor] = None,
         past_key_values: Optional[list[torch.FloatTensor]] = None,
@@ -641,9 +578,7 @@ class RForConditionalGeneration(RPreTrainedModel, GenerationMixin):
         outputs = self.model(
             input_ids=input_ids,
             pixel_values=pixel_values,
-            pixel_values_videos=pixel_values_videos,
             image_sizes=image_sizes,
-            image_sizes_videos=image_sizes_videos,
             vision_aspect_ratio=vision_aspect_ratio,
             vision_feature_layer=vision_feature_layer,
             vision_feature_select_strategy=vision_feature_select_strategy,
@@ -679,7 +614,6 @@ class RForConditionalGeneration(RPreTrainedModel, GenerationMixin):
             hidden_states=outputs.hidden_states,
             attentions=outputs.attentions,
             image_hidden_states=outputs.image_hidden_states,
-            video_hidden_states=outputs.video_hidden_states,
         )
 
     def prepare_inputs_for_generation(
@@ -689,8 +623,6 @@ class RForConditionalGeneration(RPreTrainedModel, GenerationMixin):
         inputs_embeds=None,
         pixel_values=None,
         image_sizes=None,
-        pixel_values_videos=None,
-        image_sizes_videos=None,
         attention_mask=None,
         cache_position=None,
         logits_to_keep=None,
@@ -713,8 +645,6 @@ class RForConditionalGeneration(RPreTrainedModel, GenerationMixin):
             # Otherwise we need pixel values to be passed to model
             model_inputs["pixel_values"] = pixel_values
             model_inputs["image_sizes"] = image_sizes
-            model_inputs["pixel_values_videos"] = pixel_values_videos
-            model_inputs["image_sizes_videos"] = image_sizes_videos
 
         return model_inputs
 
@@ -754,17 +684,5 @@ class RForConditionalGeneration(RPreTrainedModel, GenerationMixin):
 
         return causal_mask
 
-    def get_video_features(
-        self,
-        pixel_values: torch.FloatTensor,
-        vision_feature_layer: Optional[Union[int, list[int]]] = None,
-        vision_feature_select_strategy: Optional[str] = None,
-    ):
-        return self.model.get_video_features(
-            pixel_values=pixel_values,
-            vision_feature_layer=vision_feature_layer,
-            vision_feature_select_strategy=vision_feature_select_strategy,
-        )
-
 
 __all__ = ["RModel", "RForConditionalGeneration", "RPreTrainedModel"]

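With the video branches removed, RModel and RForConditionalGeneration accept only pixel_values and image_sizes alongside the text inputs. A hypothetical end-to-end sketch of image-only inference against the trimmed API (the image URL and generation settings are illustrative, not taken from the repo):

```python
import requests
import torch
from PIL import Image
from transformers import AutoModelForCausalLM, AutoProcessor

model_id = "YannQi/R-4B"
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id, torch_dtype=torch.bfloat16, trust_remote_code=True
)

# Illustrative image; any PIL image works.
image = Image.open(requests.get("https://example.com/cat.png", stream=True).raw)
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": "Describe the image."},
        ],
    }
]
prompt = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

# Only image inputs now: the processor returns input_ids, pixel_values and image_sizes.
inputs = processor(images=image, text=prompt, return_tensors="pt")
with torch.no_grad():
    output_ids = model.generate(**inputs, max_new_tokens=128)
print(processor.decode(output_ids[0], skip_special_tokens=True))
```
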
processing_r.py
CHANGED
@@ -36,61 +36,49 @@ class RProcessorKwargs(ProcessingKwargs, total=False):
 
         },
         "image_kwargs": {},
-        "videos_kwargs": {},
     }
 
 
 class RProcessor(ProcessorMixin):
-    attributes = ["image_processor", "tokenizer", "video_processor"]
+    attributes = ["image_processor", "tokenizer"]
     valid_kwargs = [
         "chat_template",
         "num_image_tokens",
         "image_processor_type",
         "vision_feature_select_strategy",
         "image_token",
-        "video_token",
         "vision_aspect_ratio",
     ]
     image_processor_class = "AutoImageProcessor"
     tokenizer_class = "AutoTokenizer"
-    video_processor_class = "AutoVideoProcessor"
 
     def __init__(
         self,
         image_processor=None,
         tokenizer=None,
-        video_processor=None,
         num_image_tokens=None,
         vision_feature_select_strategy=None,
         chat_template=None,
         image_token="<image>",
-        video_token="<video>",
         vision_aspect_ratio= "anyres",
         **kwargs,
     ):
         self.num_image_tokens = num_image_tokens
         self.vision_feature_select_strategy = vision_feature_select_strategy
         self.image_token = tokenizer.image_token if hasattr(tokenizer, "image_token") else image_token
-        self.video_token = tokenizer.video_token if hasattr(tokenizer, "video_token") else video_token
         self.image_token_id = (
             tokenizer.image_token_id
             if getattr(tokenizer, "image_token_id", None)
             else tokenizer.convert_tokens_to_ids(self.image_token)
         )
-        self.video_token_id = (
-            tokenizer.video_token_id
-            if getattr(tokenizer, "video_token_id", None)
-            else tokenizer.convert_tokens_to_ids(self.video_token)
-        )
         self.vision_aspect_ratio = vision_aspect_ratio
-        super().__init__(image_processor, tokenizer, video_processor, chat_template=chat_template)
+        super().__init__(image_processor, tokenizer, chat_template=chat_template)
 
     def __call__(
         self,
         images: ImageInput = None,
         text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
         audio=None,
-        videos=None,
         **kwargs: Unpack[RProcessorKwargs],
     ) -> BatchFeature:
         output_kwargs = self._merge_kwargs(
@@ -104,7 +92,7 @@ class RProcessor(ProcessorMixin):
         elif not isinstance(text, list) and not isinstance(text[0], str):
             raise ValueError("Invalid input text. Please provide a string, or a list of strings")
 
-        image_inputs = video_inputs = {}
+        image_inputs = {}
 
         if images is not None:
             image_inputs = self.image_processor(images, **output_kwargs["images_kwargs"])
@@ -119,28 +107,13 @@ class RProcessor(ProcessorMixin):
                 text, image_sizes, height, width, self.image_token, batch_num_images
             )
 
-        if videos is not None:
-            video_inputs = self.video_processor(videos, **output_kwargs["videos_kwargs"])
-
-            one_video = video_inputs.get("pixel_values_videos")[0]
-            if isinstance(video_inputs.get("pixel_values_videos")[0], (list, tuple)):
-                one_video = np.array(one_video)
-            else:
-                one_video = to_numpy_array(one_video)
-            height, width = get_image_size(one_video[0], channel_dim=output_kwargs["images_kwargs"].get("data_format"))
-            num_frames = one_video.shape[0]  # frame dim is always after batch dim
-            patches_height_width = int(math.sqrt(self.num_image_tokens))
-            pooled_height_width = math.ceil(patches_height_width / 2)
-            num_video_tokens = (num_frames * pooled_height_width * pooled_height_width) + 1  # +1 for newline token
-            text = [sample.replace(self.video_token, self.video_token * num_video_tokens) for sample in text]
-
         return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
 
         text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
         self._check_special_mm_tokens(text, text_inputs, modalities=["image"])
 
 
-        return BatchFeature(data={**text_inputs, **image_inputs, **video_inputs}, tensor_type=return_tensors)
+        return BatchFeature(data={**text_inputs, **image_inputs}, tensor_type=return_tensors)
 
     def _expand_image_tokens(
         self,

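Correspondingly, RProcessor.__call__ now merges only text and image features into the returned BatchFeature. A short hypothetical sketch of what a call produces (placeholder image and prompt):

```python
from PIL import Image
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("YannQi/R-4B", trust_remote_code=True)

image = Image.new("RGB", (640, 480))  # placeholder image
batch = processor(images=image, text="<image>\nDescribe the image.", return_tensors="pt")

# With the videos path removed, only text and image tensors remain,
# e.g. input_ids, attention_mask, pixel_values, image_sizes.
print(sorted(batch.keys()))
```
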