YannQi committed · verified
Commit 9fcd58d · 1 Parent(s): 252d879

Delete processing_xvl.py

Files changed (1):
  1. processing_xvl.py +0 -244

processing_xvl.py DELETED
@@ -1,244 +0,0 @@
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- #     http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
-
-
- import math
- from collections.abc import Iterable
- from typing import Union
-
- import numpy as np
-
- from transformers.feature_extraction_utils import BatchFeature
- from transformers.image_processing_utils import select_best_resolution
- from transformers.image_utils import ImageInput, get_image_size, to_numpy_array
- from transformers.processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
- from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
- from transformers.utils import logging
-
-
- logger = logging.get_logger(__name__)
-
-
- class RProcessorKwargs(ProcessingKwargs, total=False):
-     # see processing_utils.ProcessingKwargs documentation for usage.
-     _defaults = {
-         "text_kwargs": {
-             "padding": False,
-         },
-         "image_kwargs": {},
-         "videos_kwargs": {},
-     }
-
-
- class RProcessor(ProcessorMixin):
-     attributes = ["image_processor", "tokenizer", "video_processor"]
-     valid_kwargs = [
-         "chat_template",
-         "num_image_tokens",
-         "image_processor_type",
-         "vision_feature_select_strategy",
-         "image_token",
-         "video_token",
-         "vision_aspect_ratio",
-     ]
-     image_processor_class = "AutoImageProcessor"
-     tokenizer_class = "AutoTokenizer"
-     video_processor_class = "AutoVideoProcessor"
-
-     def __init__(
-         self,
-         image_processor=None,
-         tokenizer=None,
-         video_processor=None,
-         num_image_tokens=None,
-         vision_feature_select_strategy=None,
-         chat_template=None,
-         image_token="<image>",
-         video_token="<video>",
-         vision_aspect_ratio="anyres",
-         **kwargs,
-     ):
-         self.num_image_tokens = num_image_tokens
-         self.vision_feature_select_strategy = vision_feature_select_strategy
-         self.image_token = tokenizer.image_token if hasattr(tokenizer, "image_token") else image_token
-         self.video_token = tokenizer.video_token if hasattr(tokenizer, "video_token") else video_token
-         self.image_token_id = (
-             tokenizer.image_token_id
-             if getattr(tokenizer, "image_token_id", None)
-             else tokenizer.convert_tokens_to_ids(self.image_token)
-         )
-         self.video_token_id = (
-             tokenizer.video_token_id
-             if getattr(tokenizer, "video_token_id", None)
-             else tokenizer.convert_tokens_to_ids(self.video_token)
-         )
-         self.vision_aspect_ratio = vision_aspect_ratio
-         super().__init__(image_processor, tokenizer, video_processor, chat_template=chat_template)
-
-     def __call__(
-         self,
-         images: ImageInput = None,
-         text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
-         audio=None,
-         videos=None,
-         **kwargs: Unpack[RProcessorKwargs],
-     ) -> BatchFeature:
-         output_kwargs = self._merge_kwargs(
-             RProcessorKwargs,
-             tokenizer_init_kwargs=self.tokenizer.init_kwargs,
-             **kwargs,
-         )
-
-         if isinstance(text, str):
-             text = [text]
-         elif not isinstance(text, list) or not isinstance(text[0], str):
-             raise ValueError("Invalid input text. Please provide a string, or a list of strings")
-
-         image_inputs = video_inputs = {}
-
-         if images is not None:
-             image_inputs = self.image_processor(images, **output_kwargs["images_kwargs"])
-
-             batch_num_images = iter(image_inputs["batch_num_images"])
-             image_sizes = iter(image_inputs["image_sizes"])
-             height, width = get_image_size(
-                 to_numpy_array(image_inputs["pixel_values"][0][0]),
-                 channel_dim=output_kwargs["images_kwargs"].get("data_format"),
-             )
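-             # Expand each <image> token to one slot per vision feature the image will produce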
-             text, num_image_tokens = self._expand_image_tokens(
-                 text, image_sizes, height, width, self.image_token, batch_num_images
-             )
-
-         if videos is not None:
-             video_inputs = self.video_processor(videos, **output_kwargs["videos_kwargs"])
-
-             one_video = video_inputs.get("pixel_values_videos")[0]
-             if isinstance(video_inputs.get("pixel_values_videos")[0], (list, tuple)):
-                 one_video = np.array(one_video)
-             else:
-                 one_video = to_numpy_array(one_video)
-             height, width = get_image_size(one_video[0], channel_dim=output_kwargs["images_kwargs"].get("data_format"))
-             num_frames = one_video.shape[0]  # frame dim is always after batch dim
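-             # Each frame's patch grid is pooled by 2 per side before being flattened into video tokens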
-             patches_height_width = int(math.sqrt(self.num_image_tokens))
-             pooled_height_width = math.ceil(patches_height_width / 2)
-             num_video_tokens = (num_frames * pooled_height_width * pooled_height_width) + 1  # +1 for newline token
-             text = [sample.replace(self.video_token, self.video_token * num_video_tokens) for sample in text]
-
-         return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
-
-         text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
-         self._check_special_mm_tokens(text, text_inputs, modalities=["image"])
-
-         return BatchFeature(data={**text_inputs, **image_inputs, **video_inputs}, tensor_type=return_tensors)
-
-     def _expand_image_tokens(
-         self,
-         text: list[TextInput],
-         image_sizes: Iterable[Union[list[int], int]],
-         height: int,
-         width: int,
-         special_token: str,
-         batch_num_images: Iterable[int],
-     ):
-         prompt_strings = []
-         max_num_vision_tokens = 0
-         for sample in text:
-             if special_token in sample:
-                 is_multi_image = next(batch_num_images) != 1
-             else:
-                 is_multi_image = False
-             while special_token in sample:
-                 if is_multi_image:
-                     num_image_tokens = self.num_image_tokens + 1  # one for image_newline
-                 else:
-                     original_size = next(image_sizes)
-                     if not isinstance(original_size, (list, tuple)):
-                         # cast to list to avoid numerical precision errors when calculating unpadding
-                         original_size = original_size.tolist()
-                     orig_height, orig_width = original_size
-                     num_image_tokens = self._get_number_of_features(orig_height, orig_width, height, width)
-                 max_num_vision_tokens = max(max_num_vision_tokens, num_image_tokens)
-                 if self.vision_feature_select_strategy == "default":
-                     num_image_tokens -= 1
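-                 # Use a temporary placeholder so freshly inserted tokens are not re-matched by this while loop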
-                 sample = sample.replace(special_token, "<placeholder>" * num_image_tokens, 1)
-             prompt_strings.append(sample)
-         text = [sample.replace("<placeholder>", special_token) for sample in prompt_strings]
-         return text, max_num_vision_tokens
-
-     def _get_number_of_features(self, orig_height: int, orig_width: int, height: int, width: int) -> int:
-         image_grid_pinpoints = self.image_processor.image_grid_pinpoints
-
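-         # Pick the grid resolution from image_grid_pinpoints that best fits the original image size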
-         height_best_resolution, width_best_resolution = select_best_resolution(
-             [orig_height, orig_width], image_grid_pinpoints
-         )
-         scale_height, scale_width = height_best_resolution // height, width_best_resolution // width
-
-         patches_height = patches_width = int(math.sqrt(self.num_image_tokens))
-         unpadded_features, newline_features = self._get_unpadded_features(
-             orig_height, orig_width, patches_height, patches_width, scale_height, scale_width
-         )
-
-         # The base patch covers the entire image (no CLS for SigLIP)
-         base_features = self.num_image_tokens
-         num_image_tokens = unpadded_features + newline_features + base_features
-         return num_image_tokens
-
-     # Adapted from transformers.models.llava_next.processing_llava_next.LlavaNextProcessor._get_unpadded_features
-     def _get_unpadded_features(self, height, width, patches_height, patches_width, scale_height, scale_width):
-         current_height = patches_height * scale_height
-         current_width = patches_width * scale_width
-
-         original_aspect_ratio = width / height
-         current_aspect_ratio = current_width / current_height
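-         # Drop the feature rows or columns that correspond to padding added when fitting the image to the grid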
-         if original_aspect_ratio > current_aspect_ratio:
-             new_height = int(round(height * (current_width / width), 7))
-             padding = (current_height - new_height) // 2
-             current_height -= padding * 2
-         else:
-             new_width = int(round(width * (current_height / height), 7))
-             padding = (current_width - new_width) // 2
-             current_width -= padding * 2
-
-         unpadded_features = current_height * current_width
-         newline_features = current_height
-
-         return (unpadded_features, newline_features)
-
-     # Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Llama
-     def batch_decode(self, *args, **kwargs):
-         """
-         This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
-         refer to the docstring of this method for more information.
-         """
-         return self.tokenizer.batch_decode(*args, **kwargs)
-
-     # Copied from transformers.models.clip.processing_clip.CLIPProcessor.decode with CLIP->Llama
-     def decode(self, *args, **kwargs):
-         """
-         This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
-         the docstring of this method for more information.
-         """
-         return self.tokenizer.decode(*args, **kwargs)
-
-     @property
-     # Copied from transformers.models.clip.processing_clip.CLIPProcessor.model_input_names
-     def model_input_names(self):
-         tokenizer_input_names = self.tokenizer.model_input_names
-         image_processor_input_names = self.image_processor.model_input_names
-         return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
-
-
- __all__ = ["RProcessor"]
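
For reference, a minimal sketch of how the deleted processor was typically loaded and called. This is illustrative only: the repo id below is a placeholder, and trust_remote_code=True is assumed because RProcessor shipped with the repository rather than with transformers.

from PIL import Image
from transformers import AutoProcessor

# Placeholder repo id; substitute the actual checkpoint.
# trust_remote_code=True lets AutoProcessor import the custom RProcessor class from the repo.
processor = AutoProcessor.from_pretrained("YannQi/<repo-id>", trust_remote_code=True)

image = Image.open("example.jpg")
prompt = "<image>\nDescribe this image."

# The processor expands <image> into one token slot per vision feature,
# then tokenizes the text and bundles pixel values into a single BatchFeature.
inputs = processor(images=image, text=prompt, return_tensors="pt")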