MagedSaeed committed on
Commit
99c923e
·
verified ·
1 Parent(s): c3ceda6

Update tokenizer_script.py

Browse files
Files changed (1) hide show
  1. tokenizer_script.py +12 -0
tokenizer_script.py CHANGED
@@ -76,6 +76,18 @@ class CharacterTokenizer(PreTrainedTokenizer):
76
 
77
  return (vocab_file,)
78
 
 
 
 
 
 
 
 
 
 
 
 
 
79
  def train(self, texts):
80
  # Start with special tokens
81
  vocab = {}
 
76
 
77
  return (vocab_file,)
78
 
79
def batch_encode(self, texts, add_special_tokens=False, padding=False, max_length=None):
    """Encode a batch of texts into lists of token ids.

    Args:
        texts: Iterable of input strings.
        add_special_tokens: Forwarded to ``self.encode`` for each text.
        padding: If True, right-pad every sequence with the pad token id
            so all sequences share the same length.
        max_length: If given, truncate each sequence to this length; when
            ``padding`` is also True, sequences are padded up to it.

    Returns:
        A list of lists of token ids, one list per input text.
    """
    # Pass add_special_tokens by KEYWORD: HF's PreTrainedTokenizer.encode
    # signature is encode(text, text_pair=None, add_special_tokens=True, ...),
    # so passing it positionally would bind it to text_pair instead.
    encoded_texts = [
        self.encode(text, add_special_tokens=add_special_tokens) for text in texts
    ]
    # Truncate first so the padding width is computed from final lengths.
    if max_length is not None:
        encoded_texts = [ids[:max_length] for ids in encoded_texts]
    if padding:
        # NOTE(review): falls back to id 0 if pad_token is missing from the
        # vocab — presumably pad_token is always registered; verify.
        pad_id = self.vocab.get(self.pad_token, 0)
        if max_length is not None:
            target_len = max_length
        elif encoded_texts:
            target_len = max(len(ids) for ids in encoded_texts)
        else:
            # Empty batch: max() over an empty sequence would raise ValueError.
            target_len = 0
        encoded_texts = [
            ids + [pad_id] * (target_len - len(ids)) for ids in encoded_texts
        ]
    return encoded_texts
91
  def train(self, texts):
92
  # Start with special tokens
93
  vocab = {}