Commit 2e8ba46
Parent(s): e7c16cf

Upload processor

- preprocessor_config.json +3 -0
- processor.py +43 -0
- tokenizer_config.json +3 -0
preprocessor_config.json CHANGED
@@ -1,4 +1,7 @@
 {
+  "auto_map": {
+    "AutoProcessor": "processor.GIAProcessor"
+  },
   "crop_size": {
     "height": 224,
     "width": 224
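The `auto_map` entry added above is what lets `AutoProcessor` resolve to the custom class defined in processor.py when the repository is loaded with `trust_remote_code=True`. A minimal loading sketch (the repo id "user/repo" is a placeholder, not taken from this commit):

from transformers import AutoProcessor

# auto_map points "AutoProcessor" at processor.GIAProcessor, so loading the
# repo imports and instantiates that class; trust_remote_code=True is
# required because processor.py is code executed from the Hub.
processor = AutoProcessor.from_pretrained("user/repo", trust_remote_code=True)
print(type(processor).__name__)  # GIAProcessor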
processor.py ADDED
@@ -0,0 +1,43 @@
+from itertools import chain
+from transformers import GitProcessor
+
+class GIAProcessor(GitProcessor):
+    def __init__(self, image_processor, tokenizer):
+        super().__init__(image_processor, tokenizer)
+        self._block_size = 1024
+
+    def _group_texts(self, examples):
+        # Concatenate all texts.
+        concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
+        total_length = len(concatenated_examples[list(examples.keys())[0]])
+        # We drop the small remainder, and if the total_length < block_size we exclude this batch and return an empty dict.
+        # We could add padding if the model supported it instead of this drop, you can customize this part to your needs.
+        total_length = (total_length // self._block_size) * self._block_size
+        # Split by chunks of max_len.
+        result = {
+            k: [t[i: i + self._block_size] for i in range(0, total_length, self._block_size)]
+            for k, t in concatenated_examples.items()
+        }
+        return result
+
+    def __call__(self, text=None, images=None, return_tensors=None, **kwargs):
+        if text is not None and images is None:
+            encoded_text = self.tokenizer(text, return_tensors=return_tensors)
+            encoding = self._group_texts(encoded_text)
+        elif text is not None and images is not None:
+            encoding = super().__call__(text, images, return_tensors, **kwargs)
+
+        return encoding
+
+    def batch_decode(self, *args, **kwargs):
+        return self.tokenizer.batch_decode(*args, **kwargs)
+
+    def decode(self, *args, **kwargs):
+        return self.tokenizer.decode(*args, **kwargs)
+
+    @property
+    def model_input_names(self):
+        return ["input_ids", "attention_mask", "pixel_values"]
+
+
+GIAProcessor.register_for_auto_class("AutoProcessor")
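In the text-only branch of __call__, the tokenizer output is packed into fixed blocks of self._block_size (1024) tokens by _group_texts: all sequences in the batch are concatenated, the tail that does not fill a whole block is dropped, and the rest is split into equal chunks. A small self-contained sketch of that arithmetic on plain lists (the block size of 4 and the token values are illustrative, not from the commit):

from itertools import chain

block_size = 4
examples = {"input_ids": [[1, 2, 3], [4, 5, 6, 7], [8, 9]]}

# Same steps as GIAProcessor._group_texts, with a tiny block size:
concatenated = {k: list(chain(*v)) for k, v in examples.items()}  # 9 tokens
total_length = (len(concatenated["input_ids"]) // block_size) * block_size  # 8
chunks = {
    k: [t[i:i + block_size] for i in range(0, total_length, block_size)]
    for k, t in concatenated.items()
}
print(chunks["input_ids"])  # [[1, 2, 3, 4], [5, 6, 7, 8]] -- token 9 is dropped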
tokenizer_config.json CHANGED
@@ -1,4 +1,7 @@
 {
+  "auto_map": {
+    "AutoProcessor": "processor.GIAProcessor"
+  },
   "clean_up_tokenization_spaces": true,
   "cls_token": "[CLS]",
   "do_lower_case": true,
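Both config files register the same auto_map, so AutoProcessor resolves to GIAProcessor whether it starts from the image-processor config or the tokenizer config. Two caveats are visible in __call__ above: the image-only case (images given, text=None) falls through both branches and leaves `encoding` unbound, and a text-only input shorter than 1024 tokens is dropped entirely by _group_texts. A hedged usage sketch, assuming `processor` was loaded as shown earlier:

# Text-only path: returns 1024-token chunks as plain Python lists when no
# return_tensors is passed (chunking happens after tokenization).
batch = processor(text=["a long training document ..."])
decoded = processor.batch_decode(batch["input_ids"], skip_special_tokens=True)
# Text + images path defers to GitProcessor and also returns pixel_values.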