YannQi committed on
Commit 3c94c68 · verified · 1 Parent(s): 9fcd58d

Upload folder using huggingface_hub

.hfd/aria2c_urls.txt ADDED
File without changes
.hfd/last_download_command ADDED
@@ -0,0 +1 @@
+ REPO_ID=YannQi/R-4B TOOL=aria2c INCLUDE_PATTERNS= EXCLUDE_PATTERNS= DATASET=0 HF_USERNAME= HF_TOKEN= HF_TOKEN=https://huggingface.co REVISION=main
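
The recorded command fetches this repository with the hfd helper (aria2c backend). A minimal Python equivalent, assuming only the public `huggingface_hub` API and the values recorded above (repo `YannQi/R-4B`, revision `main`):

```python
# Hedged sketch, not part of the commit: mirrors the recorded download
# (REPO_ID=YannQi/R-4B, REVISION=main) using the standard huggingface_hub API.
from huggingface_hub import snapshot_download

local_dir = snapshot_download(
    repo_id="YannQi/R-4B",  # REPO_ID from .hfd/last_download_command
    revision="main",        # REVISION from .hfd/last_download_command
)
print(local_dir)
```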
.hfd/repo_metadata.json ADDED
@@ -0,0 +1 @@
+ {"_id":"6899c7b833b8a4a0398a0ed2","id":"YannQi/R-4B","private":false,"pipeline_tag":"visual-question-answering","tags":["safetensors","R","visual-question-answering","custom_code","en","base_model:Qwen/Qwen3-4B","base_model:finetune:Qwen/Qwen3-4B","license:apache-2.0","region:us"],"downloads":0,"likes":3,"modelId":"YannQi/R-4B","author":"YannQi","sha":"9fcd58d9d7b03add99ea92df619b24fa60a0e1ac","lastModified":"2025-08-11T11:55:23.000Z","gated":false,"disabled":false,"model-index":null,"config":{"auto_map":{"AutoConfig":"configuration_r.RConfig","AutoModel":"modeling_r.RForConditionalGeneration","AutoModelForCausalLM":"modeling_r.RForConditionalGeneration"},"architectures":["RForConditionalGeneration"],"model_type":"R","tokenizer_config":{"bos_token":null,"eos_token":"<|im_end|>","pad_token":"<|endoftext|>","unk_token":null},"chat_template_jinja":"{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n'}}{# Render all images first #}{% for content in message['content'] | selectattr('type', 'equalto', 'image') %}{{ '<image>\n' }}{% endfor %}{# Render all video then #}{% for content in message['content'] | selectattr('type', 'equalto', 'video') %}{{ '<video>\n' }}{% endfor %}{# Render all text next #}{% if message['role'] != 'assistant' %}{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}{{ content['text'] }}{% endfor %}{% else %}{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}{% generation %}{{ content['text'] }}{% endgeneration %}{% endfor %}{% endif %}{{'<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n<think>' }}{% endif %}{%- if add_generation_prompt %}{%- if thinking_mode is defined and thinking_mode == 'short' %}{{- '\n\n</think>\n\n' }}{%- endif %}{%- if thinking_mode is defined and thinking_mode == 'long' %}{{- '\n' }}{%- endif %}{%- endif %}\n"},"cardData":{"license":"apache-2.0","language":["en"],"base_model":["Qwen/Qwen3-4B"],"pipeline_tag":"visual-question-answering"},"siblings":[{"rfilename":".gitattributes"},{"rfilename":"README.md"},{"rfilename":"added_tokens.json"},{"rfilename":"asset/R-4B.png"},{"rfilename":"asset/performance.png"},{"rfilename":"chat_template.jinja"},{"rfilename":"config.json"},{"rfilename":"configuration_r.py"},{"rfilename":"generation_config.json"},{"rfilename":"image_processing_r.py"},{"rfilename":"image_processing_r_fast.py"},{"rfilename":"merges.txt"},{"rfilename":"model-00001-of-00003.safetensors"},{"rfilename":"model-00002-of-00003.safetensors"},{"rfilename":"model-00003-of-00003.safetensors"},{"rfilename":"model.safetensors.index.json"},{"rfilename":"modeling_r.py"},{"rfilename":"preprocessor_config.json"},{"rfilename":"processing_r.py"},{"rfilename":"processor_config.json"},{"rfilename":"special_tokens_map.json"},{"rfilename":"tokenizer.json"},{"rfilename":"tokenizer_config.json"},{"rfilename":"video_preprocessor_config.json"},{"rfilename":"vocab.json"}],"spaces":[],"createdAt":"2025-08-11T10:36:40.000Z","safetensors":{"parameters":{"BF16":4819012384},"total":4819012384},"usedStorage":9653302738}
configuration_r.py CHANGED
@@ -27,7 +27,6 @@ class RConfig(PretrainedConfig):
     model_type = "R"
     attribute_map = {
         "image_token_id": "image_token_index",
-        "video_token_id": "video_token_index",
     }
     sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig}
 
@@ -36,7 +35,6 @@ class RConfig(PretrainedConfig):
         vision_config=None,
         text_config=None,
         image_token_index=151646,
-        video_token_index=151647,
         projector_hidden_act="gelu",
         vision_feature_select_strategy="full",
         vision_feature_layer=-1,
@@ -48,7 +46,6 @@ class RConfig(PretrainedConfig):
         **kwargs,
     ):
         self.image_token_index = image_token_index
-        self.video_token_index = video_token_index
         self.projector_hidden_act = projector_hidden_act
         self.multimodal_projector_bias = multimodal_projector_bias
 
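
After this change `RConfig` keeps `image_token_index` (default 151646) but no longer defines `video_token_index`. A quick check, assuming the config is loaded with remote code enabled:

```python
# Hedged sketch: verifies the config surface after the video fields were dropped.
from transformers import AutoConfig

config = AutoConfig.from_pretrained("YannQi/R-4B", trust_remote_code=True)
print(config.image_token_index)              # 151646
print(hasattr(config, "video_token_index"))  # expected False after this commit
```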
modeling_r.py CHANGED
@@ -44,8 +44,6 @@ class RModelOutputWithPast(BaseModelOutputWithPast):
 
     image_hidden_states: Optional[torch.FloatTensor] = None
 
-    video_hidden_states: Optional[torch.FloatTensor] = None
-
 
 @dataclass
 class RCausalLMOutputWithPast(ModelOutput):
@@ -57,8 +55,6 @@ class RCausalLMOutputWithPast(ModelOutput):
     attentions: Optional[tuple[torch.FloatTensor]] = None
     image_hidden_states: Optional[torch.FloatTensor] = None
 
-    video_hidden_states: Optional[torch.FloatTensor] = None
-
 
 class RPooler(nn.Module):
     def __init__(self, config):
@@ -364,8 +360,6 @@ class RModel(RPreTrainedModel):
         input_ids: torch.LongTensor = None,
         pixel_values: torch.FloatTensor = None,
         image_sizes: Optional[torch.LongTensor] = None,
-        pixel_values_videos: torch.FloatTensor = None,
-        image_sizes_videos: Optional[torch.LongTensor] = None,
         attention_mask: Optional[torch.Tensor] = None,
         position_ids: Optional[torch.LongTensor] = None,
         past_key_values: Optional[list[torch.FloatTensor]] = None,
@@ -403,9 +397,9 @@ class RModel(RPreTrainedModel):
         if (input_ids is None) ^ (inputs_embeds is not None):
             raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
 
-        if (pixel_values is not None or pixel_values_videos is not None) and inputs_embeds is not None:
+        if pixel_values is not None and inputs_embeds is not None:
             raise ValueError(
-                "You cannot specify both `pixel_values`/`pixel_values_videos` and `inputs_embeds` at the same time, "
+                "You cannot specify both `pixel_values` and `inputs_embeds` at the same time, "
                 "and must specify either one"
             )
         if inputs_embeds is None:
@@ -434,30 +428,6 @@ class RModel(RPreTrainedModel):
             image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
             inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)
 
-        # Video are simply embedded and further pooled to decrease seq len
-        if pixel_values_videos is not None:
-            video_features = self.get_video_features(
-                pixel_values_videos,
-                vision_feature_layer=vision_feature_layer,
-                vision_feature_select_strategy=vision_feature_select_strategy,
-            )
-            image_newline = (
-                self.image_newline[None, None, :].repeat(video_features.shape[0], 1, 1).to(video_features.device)
-            )
-            video_features = torch.cat((video_features, image_newline), dim=1)
-            video_features = video_features.flatten(0, 1)
-
-            special_video_mask = (input_ids == self.config.video_token_id).unsqueeze(-1)
-            special_video_mask = special_video_mask.expand_as(inputs_embeds).to(inputs_embeds.device)
-            if not is_torchdynamo_compiling() and inputs_embeds[special_video_mask].numel() != video_features.numel():
-                n_video_tokens = (input_ids == self.config.video_token_id).sum()
-                n_video_features = video_features.shape[0]
-                raise ValueError(
-                    f"Video features and video tokens do not match: tokens: {n_video_tokens}, features {n_video_features}"
-                )
-            video_features = video_features.to(inputs_embeds.device, inputs_embeds.dtype)
-            inputs_embeds = inputs_embeds.masked_scatter(special_video_mask, video_features)
-
         outputs = self.language_model(
             attention_mask=attention_mask,
             position_ids=position_ids,
@@ -477,7 +447,6 @@ class RModel(RPreTrainedModel):
             hidden_states=outputs.hidden_states,
             attentions=outputs.attentions,
             image_hidden_states=image_features if pixel_values is not None else None,
-            video_hidden_states=video_features if pixel_values_videos is not None else None,
         )
 
     def apply_pooling(self, image_features):
@@ -494,36 +463,6 @@ class RModel(RPreTrainedModel):
         image_features = image_features.view(batch_frames, -1, dim)
         return image_features
 
-    def get_video_features(
-        self,
-        pixel_values: torch.FloatTensor,
-        vision_feature_layer: Union[int, list[int]],
-        vision_feature_select_strategy: str,
-    ):
-        batch_size, frames, channels, height, width = pixel_values.shape
-        pixel_values = pixel_values.view(batch_size * frames, channels, height, width)
-        video_features = self.vision_tower(pixel_values, output_hidden_states=True)
-
-        # If we have one vision feature layer, return the corresponding hidden states,
-        # otherwise, select the hidden states of each feature layer and concatenate them
-        if isinstance(vision_feature_layer, int):
-            selected_video_feature = video_features.hidden_states[vision_feature_layer]
-        else:
-            hs_pool = [video_features.hidden_states[layer_idx] for layer_idx in vision_feature_layer]
-            selected_video_feature = torch.cat(hs_pool, dim=-1)
-
-        if vision_feature_select_strategy == "default":
-            selected_video_feature = selected_video_feature[:, 1:]
-        elif vision_feature_select_strategy == "full":
-            selected_video_feature = selected_video_feature
-        video_features = self.multi_modal_projector(selected_video_feature)
-
-        video_features = self.apply_pooling(video_features)
-        video_features = video_features.reshape(batch_size, frames * video_features.shape[1], -1)
-
-        return video_features
-
-
 class RForConditionalGeneration(RPreTrainedModel, GenerationMixin):
     _checkpoint_conversion_mapping = {
         "^language_model.model": "model.language_model",
@@ -599,8 +538,6 @@ class RForConditionalGeneration(RPreTrainedModel, GenerationMixin):
         input_ids: torch.LongTensor = None,
         pixel_values: torch.FloatTensor = None,
         image_sizes: Optional[torch.LongTensor] = None,
-        pixel_values_videos: torch.FloatTensor = None,
-        image_sizes_videos: Optional[torch.LongTensor] = None,
         attention_mask: Optional[torch.Tensor] = None,
         position_ids: Optional[torch.LongTensor] = None,
         past_key_values: Optional[list[torch.FloatTensor]] = None,
@@ -641,9 +578,7 @@ class RForConditionalGeneration(RPreTrainedModel, GenerationMixin):
         outputs = self.model(
             input_ids=input_ids,
             pixel_values=pixel_values,
-            pixel_values_videos=pixel_values_videos,
             image_sizes=image_sizes,
-            image_sizes_videos=image_sizes_videos,
             vision_aspect_ratio=vision_aspect_ratio,
             vision_feature_layer=vision_feature_layer,
             vision_feature_select_strategy=vision_feature_select_strategy,
@@ -679,7 +614,6 @@ class RForConditionalGeneration(RPreTrainedModel, GenerationMixin):
             hidden_states=outputs.hidden_states,
             attentions=outputs.attentions,
             image_hidden_states=outputs.image_hidden_states,
-            video_hidden_states=outputs.video_hidden_states,
         )
 
     def prepare_inputs_for_generation(
@@ -689,8 +623,6 @@ class RForConditionalGeneration(RPreTrainedModel, GenerationMixin):
         inputs_embeds=None,
         pixel_values=None,
        image_sizes=None,
-        pixel_values_videos=None,
-        image_sizes_videos=None,
         attention_mask=None,
         cache_position=None,
         logits_to_keep=None,
@@ -713,8 +645,6 @@ class RForConditionalGeneration(RPreTrainedModel, GenerationMixin):
             # Otherwise we need pixel values to be passed to model
             model_inputs["pixel_values"] = pixel_values
             model_inputs["image_sizes"] = image_sizes
-            model_inputs["pixel_values_videos"] = pixel_values_videos
-            model_inputs["image_sizes_videos"] = image_sizes_videos
 
         return model_inputs
 
@@ -754,17 +684,5 @@ class RForConditionalGeneration(RPreTrainedModel, GenerationMixin):
 
         return causal_mask
 
-    def get_video_features(
-        self,
-        pixel_values: torch.FloatTensor,
-        vision_feature_layer: Optional[Union[int, list[int]]] = None,
-        vision_feature_select_strategy: Optional[str] = None,
-    ):
-        return self.model.get_video_features(
-            pixel_values=pixel_values,
-            vision_feature_layer=vision_feature_layer,
-            vision_feature_select_strategy=vision_feature_select_strategy,
-        )
-
 
 __all__ = ["RModel", "RForConditionalGeneration", "RPreTrainedModel"]
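
With the video path removed, `RModel.forward` and `RForConditionalGeneration` accept only image inputs (`pixel_values`, `image_sizes`); `pixel_values_videos` and `image_sizes_videos` are no longer valid arguments. A hedged generation sketch, assuming `model` and `processor` loaded as above and a purely illustrative local `example.jpg`:

```python
# Hedged sketch: image-only generation after this commit; passing
# pixel_values_videos would now fail because the parameter was removed.
from PIL import Image

image = Image.open("example.jpg")  # hypothetical local image
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": "Describe this image."},
        ],
    }
]
prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
inputs = processor(images=image, text=prompt, return_tensors="pt").to(model.device)
output_ids = model.generate(**inputs, max_new_tokens=64)
print(processor.tokenizer.decode(output_ids[0], skip_special_tokens=True))
```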
processing_r.py CHANGED
@@ -36,61 +36,49 @@ class RProcessorKwargs(ProcessingKwargs, total=False):
 
         },
         "image_kwargs": {},
-        "videos_kwargs": {},
     }
 
 
 class RProcessor(ProcessorMixin):
-    attributes = ["image_processor", "tokenizer", "video_processor"]
+    attributes = ["image_processor", "tokenizer"]
     valid_kwargs = [
         "chat_template",
         "num_image_tokens",
         "image_processor_type",
         "vision_feature_select_strategy",
         "image_token",
-        "video_token",
         "vision_aspect_ratio",
     ]
     image_processor_class = "AutoImageProcessor"
     tokenizer_class = "AutoTokenizer"
-    video_processor_class = "AutoVideoProcessor"
 
     def __init__(
         self,
         image_processor=None,
         tokenizer=None,
-        video_processor=None,
         num_image_tokens=None,
         vision_feature_select_strategy=None,
         chat_template=None,
         image_token="<image>",
-        video_token="<video>",
         vision_aspect_ratio= "anyres",
         **kwargs,
     ):
         self.num_image_tokens = num_image_tokens
         self.vision_feature_select_strategy = vision_feature_select_strategy
         self.image_token = tokenizer.image_token if hasattr(tokenizer, "image_token") else image_token
-        self.video_token = tokenizer.video_token if hasattr(tokenizer, "video_token") else video_token
         self.image_token_id = (
             tokenizer.image_token_id
             if getattr(tokenizer, "image_token_id", None)
             else tokenizer.convert_tokens_to_ids(self.image_token)
         )
-        self.video_token_id = (
-            tokenizer.video_token_id
-            if getattr(tokenizer, "video_token_id", None)
-            else tokenizer.convert_tokens_to_ids(self.video_token)
-        )
         self.vision_aspect_ratio = vision_aspect_ratio
-        super().__init__(image_processor, tokenizer, video_processor, chat_template=chat_template)
+        super().__init__(image_processor, tokenizer, chat_template=chat_template)
 
     def __call__(
         self,
         images: ImageInput = None,
         text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
         audio=None,
-        videos=None,
         **kwargs: Unpack[RProcessorKwargs],
     ) -> BatchFeature:
         output_kwargs = self._merge_kwargs(
@@ -104,7 +92,7 @@ class RProcessor(ProcessorMixin):
         elif not isinstance(text, list) and not isinstance(text[0], str):
             raise ValueError("Invalid input text. Please provide a string, or a list of strings")
 
-        image_inputs = video_inputs = {}
+        image_inputs = {}
 
         if images is not None:
             image_inputs = self.image_processor(images, **output_kwargs["images_kwargs"])
@@ -119,28 +107,13 @@ class RProcessor(ProcessorMixin):
                 text, image_sizes, height, width, self.image_token, batch_num_images
             )
 
-        if videos is not None:
-            video_inputs = self.video_processor(videos, **output_kwargs["videos_kwargs"])
-
-            one_video = video_inputs.get("pixel_values_videos")[0]
-            if isinstance(video_inputs.get("pixel_values_videos")[0], (list, tuple)):
-                one_video = np.array(one_video)
-            else:
-                one_video = to_numpy_array(one_video)
-            height, width = get_image_size(one_video[0], channel_dim=output_kwargs["images_kwargs"].get("data_format"))
-            num_frames = one_video.shape[0]  # frame dim is always after batch dim
-            patches_height_width = int(math.sqrt(self.num_image_tokens))
-            pooled_height_width = math.ceil(patches_height_width / 2)
-            num_video_tokens = (num_frames * pooled_height_width * pooled_height_width) + 1  # +1 for newline token
-            text = [sample.replace(self.video_token, self.video_token * num_video_tokens) for sample in text]
-
         return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
 
         text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
         self._check_special_mm_tokens(text, text_inputs, modalities=["image"])
 
 
-        return BatchFeature(data={**text_inputs, **image_inputs, **video_inputs}, tensor_type=return_tensors)
+        return BatchFeature(data={**text_inputs, **image_inputs}, tensor_type=return_tensors)
 
     def _expand_image_tokens(
         self,
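
After this change `RProcessor` manages only an image processor and a tokenizer: the `videos=` argument and `<video>` token handling are gone, and `__call__` returns a `BatchFeature` with text and image tensors only. A small usage sketch, reusing the hypothetical `processor` and `example.jpg` from above:

```python
# Hedged sketch: the processor now expands only <image> placeholders and
# returns text + image tensors; there is no videos= argument anymore.
from PIL import Image

image = Image.open("example.jpg")  # hypothetical local image
text = "<image>\nWhat is shown in the picture?"
batch = processor(images=image, text=text, return_tensors="pt")
print(sorted(batch.keys()))  # e.g. attention_mask, image_sizes, input_ids, pixel_values
```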