Update tiktoken_tokenizer.py
tiktoken_tokenizer.py  CHANGED  +10 -13
@@ -53,24 +53,21 @@ class BaseTokenizer(PreTrainedTokenizer):
         return NotImplemented
 
 class TikTokenizer(BaseTokenizer):
-
-    def from_pretrained(path, *inputs, **kwargs):
-        return TikTokenizer(vocab_file=os.path.join(path, "tokenizer.tiktoken"))
+    vocab_files_names = {"vocab_file": "tokenizer.tiktoken"}
 
-    def __init__(self, vocab_file
+    def __init__(self, vocab_file, **kwargs):
         pat_str = "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
         self.pat_str = re.compile(pat_str)
 
         self.b64_vocab = {}
-
-
-
-
-
-
-
-
-        self.b64_vocab['%s' % token] = rank
+        mergeable_ranks = {}
+        with open(vocab_file) as f:
+            for line in f:
+                token, rank = line.strip().split()
+                rank = int(rank)
+                token = base64.b64decode(token)
+                mergeable_ranks[token] = rank
+                self.b64_vocab['%s' % token] = rank
 
         self.special_tokens = ["<|endoftext|>", "[MASK]", "[gMASK]", "[sMASK]", "<sop>", "<eop>", "<|system|>",
                                "<|user|>", "<|assistant|>", "<|observation|>"]
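The removed side of the hunk shows the previous copy of the file was broken: an unterminated `def __init__(self, vocab_file` signature, a run of blank lines, and a dangling `self.b64_vocab['%s' % token] = rank`. The commit restores a complete loader that expects one `<base64-encoded token> <rank>` pair per line of tokenizer.tiktoken, decodes each token back to raw bytes, and records its BPE merge rank in `mergeable_ranks` (note that `'%s' % token` on a bytes object stores repr-style string keys such as "b'hello'" in `b64_vocab`). The diff does not show how `mergeable_ranks` is consumed further down the file; the following is a minimal sketch, assuming the ranks and the special-token list feed a standard `tiktoken.Encoding`, with special tokens taking the ids immediately after the BPE vocabulary:

import base64

import tiktoken

# Same pattern string as in the diff; tiktoken compiles it with its own
# regex engine, which supports the \p{L}/\p{N} classes.
PAT_STR = "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"

# Parse "base64(token) rank" lines exactly as the new __init__ does.
mergeable_ranks = {}
with open("tokenizer.tiktoken") as f:
    for line in f:
        token, rank = line.strip().split()
        mergeable_ranks[base64.b64decode(token)] = int(rank)

special_tokens = ["<|endoftext|>", "[MASK]", "[gMASK]", "[sMASK]", "<sop>", "<eop>",
                  "<|system|>", "<|user|>", "<|assistant|>", "<|observation|>"]

encoding = tiktoken.Encoding(
    name="tiktoken_tokenizer",  # hypothetical name
    pat_str=PAT_STR,
    mergeable_ranks=mergeable_ranks,
    # Assumption: special tokens take the ids right after the BPE vocabulary.
    special_tokens={t: len(mergeable_ranks) + i for i, t in enumerate(special_tokens)},
)

One caveat on the retained line `self.pat_str = re.compile(pat_str)`: the standard-library `re` module rejects `\p{L}`-style classes, so this only runs if `re` here is the third-party `regex` package (presumably imported as `re` elsewhere in the file).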
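The other half of the change swaps a hand-rolled loader for the standard Transformers machinery. The removed `from_pretrained` (a plain function with no `self` and no `@staticmethod` decorator) only handled a local directory and ignored its `*inputs`/`**kwargs`. Declaring `vocab_files_names = {"vocab_file": "tokenizer.tiktoken"}` instead lets the inherited `PreTrainedTokenizer.from_pretrained` resolve the vocab file itself, Hub download and cache lookup included, and pass it to `__init__` as `vocab_file`; the new `**kwargs` on `__init__` absorbs the loader's standard keyword arguments. Loading then goes through the usual entry point, assuming the repo's tokenizer_config.json maps AutoTokenizer to this class (the repo id below is a placeholder):

from transformers import AutoTokenizer

# trust_remote_code is required because TikTokenizer lives in the model
# repository rather than in the transformers library itself;
# tokenizer.tiktoken is fetched automatically and passed as vocab_file.
tokenizer = AutoTokenizer.from_pretrained("org/model-repo", trust_remote_code=True)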