Update tokenizer_script.py
tokenizer_script.py  +1 -2
@@ -76,7 +76,7 @@ class CharacterTokenizer(PreTrainedTokenizer):
 
         return (vocab_file,)
 
-    def batch_encode(self, texts, add_special_tokens=False, padding=False, max_length=None):
+    def batch_encode(self, texts, add_special_tokens=False, padding=False, truncation=True, max_length=None):
        encoded_texts = [self.encode(text, add_special_tokens) for text in texts]
        # Handle max_length (truncation)
        if max_length is not None:
@@ -123,7 +123,6 @@ class CharacterTokenizer(PreTrainedTokenizer):
     def convert_tokens_to_string(self, tokens):
        return "".join(tokens)
 
-
    @classmethod
    def from_json(cls, vocab_file, **kwargs):
        with open(vocab_file, 'r', encoding='utf-8') as f:
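For context, a minimal usage sketch of the updated signature. The module name follows the file in this diff, `from_json` is the classmethod shown in the second hunk, and the vocabulary path and input strings are hypothetical; the exact padding/truncation behaviour of `batch_encode` beyond what the context lines show is an assumption, not part of this change.

# Sketch only: assumes tokenizer_script.py is importable and a character-level
# vocab file exists at the (hypothetical) path below.
from tokenizer_script import CharacterTokenizer

tokenizer = CharacterTokenizer.from_json("vocab.json")

# With the new `truncation=True` default, callers can cap sequence length via
# max_length without passing an extra flag; padding stays opt-in.
batch = tokenizer.batch_encode(
    ["hello", "tokenizer"],
    add_special_tokens=False,
    padding=True,
    truncation=True,
    max_length=8,
)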