MagedSaeed committed on
Commit
26487ca
·
verified ·
1 Parent(s): 99c923e

Update tokenizer_script.py

Browse files
Files changed (1) hide show
  1. tokenizer_script.py +1 -2
tokenizer_script.py CHANGED
@@ -76,7 +76,7 @@ class CharacterTokenizer(PreTrainedTokenizer):
76
 
77
  return (vocab_file,)
78
 
79
- def batch_encode(self, texts, add_special_tokens=False, padding=False, max_length=None):
80
  encoded_texts = [self.encode(text, add_special_tokens) for text in texts]
81
  # Handle max_length (truncation)
82
  if max_length is not None:
@@ -123,7 +123,6 @@ class CharacterTokenizer(PreTrainedTokenizer):
123
  def convert_tokens_to_string(self, tokens):
124
  return "".join(tokens)
125
 
126
-
127
  @classmethod
128
  def from_json(cls, vocab_file, **kwargs):
129
  with open(vocab_file, 'r', encoding='utf-8') as f:
 
76
 
77
  return (vocab_file,)
78
 
79
+ def batch_encode(self, texts, add_special_tokens=False, padding=False, truncation=True, max_length=None):
80
  encoded_texts = [self.encode(text, add_special_tokens) for text in texts]
81
  # Handle max_length (truncation)
82
  if max_length is not None:
 
123
  def convert_tokens_to_string(self, tokens):
124
  return "".join(tokens)
125
 
 
126
  @classmethod
127
  def from_json(cls, vocab_file, **kwargs):
128
  with open(vocab_file, 'r', encoding='utf-8') as f: