MagedSaeed committed on
Commit
c3ceda6
·
verified ·
1 Parent(s): 9946b07

Upload tokenizer

Browse files
Files changed (1) hide show
  1. tokenizer_script.py +18 -20
tokenizer_script.py CHANGED
@@ -12,6 +12,7 @@ class CharacterTokenizer(PreTrainedTokenizer):
12
 
13
  def __init__(
14
  self,
 
15
  unk_token="[UNK]",
16
  pad_token="[PAD]",
17
  bos_token="[BOS]",
@@ -19,19 +20,19 @@ class CharacterTokenizer(PreTrainedTokenizer):
19
  sep_token="[SEP]",
20
  **kwargs
21
  ):
22
-
23
- vocab = {}
24
- # Add special tokens
25
- special_tokens = [
26
- unk_token,
27
- pad_token,
28
- bos_token,
29
- eos_token,
30
- sep_token,
31
- ]
32
- for token in special_tokens:
33
- if token not in vocab:
34
- vocab[token] = len(vocab)
35
  self.vocab = vocab
36
  self.inv_vocab = {v: k for k, v in self.vocab.items()}
37
 
@@ -113,16 +114,13 @@ class CharacterTokenizer(PreTrainedTokenizer):
113
 
114
  @classmethod
115
  def from_json(cls, vocab_file, **kwargs):
116
- print('vocab file is:',vocab_file)
117
- with open(vocab_file, 'r', encoding='utf-8') as f:
118
- vocab = json.load(f)
119
-
120
- return cls(vocab=vocab, **kwargs)
121
 
122
  @classmethod
123
  def from_vocab(cls, vocab, **kwargs):
124
- print('vocab are:',vocab)
125
- return cls(vocab=vocab, **kwargs)
126
 
127
  @classmethod
128
  def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
 
12
 
13
  def __init__(
14
  self,
15
+ vocab=None,
16
  unk_token="[UNK]",
17
  pad_token="[PAD]",
18
  bos_token="[BOS]",
 
20
  sep_token="[SEP]",
21
  **kwargs
22
  ):
23
+ if vocab is None:
24
+ vocab = {}
25
+ # Add special tokens
26
+ special_tokens = [
27
+ unk_token,
28
+ pad_token,
29
+ bos_token,
30
+ eos_token,
31
+ sep_token,
32
+ ]
33
+ for token in special_tokens:
34
+ if token not in vocab:
35
+ vocab[token] = len(vocab)
36
  self.vocab = vocab
37
  self.inv_vocab = {v: k for k, v in self.vocab.items()}
38
 
 
114
 
115
  @classmethod
116
  def from_json(cls, vocab_file, **kwargs):
117
+ with open(vocab_file, 'r', encoding='utf-8') as f:
118
+ vocab = json.load(f)
119
+ return cls(vocab=vocab, **kwargs)
 
 
120
 
121
  @classmethod
122
  def from_vocab(cls, vocab, **kwargs):
123
+ return cls(vocab=vocab, **kwargs)
 
124
 
125
  @classmethod
126
  def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):