cannot load tokenizer

#4 opened by g1a5535

{
"name": "ValueError",
"message": "Converting from Tiktoken failed, if a converter for SentencePiece is available, provide a model path with a SentencePiece tokenizer.model file.Currently available slow->fast convertors: ['AlbertTokenizer', 'BartTokenizer', 'BarthezTokenizer', 'BertTokenizer', 'BigBirdTokenizer', 'BlenderbotTokenizer', 'CamembertTokenizer', 'CLIPTokenizer', 'CodeGenTokenizer', 'ConvBertTokenizer', 'DebertaTokenizer', 'DebertaV2Tokenizer', 'DistilBertTokenizer', 'DPRReaderTokenizer', 'DPRQuestionEncoderTokenizer', 'DPRContextEncoderTokenizer', 'ElectraTokenizer', 'FNetTokenizer', 'FunnelTokenizer', 'GPT2Tokenizer', 'HerbertTokenizer', 'LayoutLMTokenizer', 'LayoutLMv2Tokenizer', 'LayoutLMv3Tokenizer', 'LayoutXLMTokenizer', 'LongformerTokenizer', 'LEDTokenizer', 'LxmertTokenizer', 'MarkupLMTokenizer', 'MBartTokenizer', 'MBart50Tokenizer', 'MPNetTokenizer', 'MobileBertTokenizer', 'MvpTokenizer', 'NllbTokenizer', 'OpenAIGPTTokenizer', 'PegasusTokenizer', 'Qwen2Tokenizer', 'RealmTokenizer', 'ReformerTokenizer', 'RemBertTokenizer', 'RetriBertTokenizer', 'RobertaTokenizer', 'RoFormerTokenizer', 'SeamlessM4TTokenizer', 'SqueezeBertTokenizer', 'T5Tokenizer', 'UdopTokenizer', 'WhisperTokenizer', 'XLMRobertaTokenizer', 'XLNetTokenizer', 'SplinterTokenizer', 'XGLMTokenizer', 'LlamaTokenizer', 'CodeLlamaTokenizer', 'GemmaTokenizer', 'Phi3Tokenizer']",
"stack": "---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
File ~/.local/lib/python3.8/site-packages/transformers/convert_slow_tokenizer.py:1633, in convert_slow_tokenizer(transformer_tokenizer, from_tiktoken)
1632 logger.info("Converting from Tiktoken")
-> 1633 return TikTokenConverter(
1634 vocab_file=transformer_tokenizer.vocab_file,
1635 additional_special_tokens=transformer_tokenizer.additional_special_tokens,
1636 ).converted()
1637 except Exception:

File ~/.local/lib/python3.8/site-packages/transformers/convert_slow_tokenizer.py:1533, in TikTokenConverter.converted(self)
1532 def converted(self) -> Tokenizer:
-> 1533 tokenizer = self.tokenizer()
1534 tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
1535 [
1536 pre_tokenizers.Split(Regex(self.pattern), behavior="isolated", invert=False),
1537 pre_tokenizers.ByteLevel(add_prefix_space=self.add_prefix_space, use_regex=False),
1538 ]
1539 )

File ~/.local/lib/python3.8/site-packages/transformers/convert_slow_tokenizer.py:1526, in TikTokenConverter.tokenizer(self)
1525 def tokenizer(self):
-> 1526 vocab_scores, merges = self.extract_vocab_merges_from_model(self.vocab_file)
1527 tokenizer = Tokenizer(BPE(vocab_scores, merges, fuse_unk=False))

File ~/.local/lib/python3.8/site-packages/transformers/convert_slow_tokenizer.py:1502, in TikTokenConverter.extract_vocab_merges_from_model(self, tiktoken_url)
1498 raise ValueError(
1499 "tiktoken is required to read a tiktoken file. Install it with " "pip install tiktoken."
1500 )
-> 1502 bpe_ranks = load_tiktoken_bpe(tiktoken_url)
1503 byte_encoder = bytes_to_unicode()

File ~/.local/lib/python3.8/site-packages/tiktoken/load.py:148, in load_tiktoken_bpe(tiktoken_bpe_file, expected_hash)
147 contents = read_file_cached(tiktoken_bpe_file, expected_hash)
--> 148 return {
149 base64.b64decode(token): int(rank)
150 for token, rank in (line.split() for line in contents.splitlines() if line)
151 }

File ~/.local/lib/python3.8/site-packages/tiktoken/load.py:150, in <dictcomp>(.0)
147 contents = read_file_cached(tiktoken_bpe_file, expected_hash)
148 return {
149 base64.b64decode(token): int(rank)
--> 150 for token, rank in (line.split() for line in contents.splitlines() if line)
151 }

ValueError: not enough values to unpack (expected 2, got 1)

During handling of the above exception, another exception occurred:

ValueError Traceback (most recent call last)
Cell In[13], line 1
----> 1 tokenizer = AutoTokenizer.from_pretrained(model_name)
2 print("tokenizer loaded")
3 tokenizer.save_pretrained(f"tokenizer_{model_name.replace('/', '--')}")

File ~/.local/lib/python3.8/site-packages/transformers/models/auto/tokenization_auto.py:920, in AutoTokenizer.from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs)
916 if tokenizer_class is None:
917 raise ValueError(
918 f"Tokenizer class {tokenizer_class_candidate} does not exist or is not currently imported."
919 )
--> 920 return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
922 # Otherwise we have to be creative.
923 # if model is an encoder decoder, the encoder tokenizer class is used by default
924 if isinstance(config, EncoderDecoderConfig):

File ~/.local/lib/python3.8/site-packages/transformers/tokenization_utils_base.py:2213, in PreTrainedTokenizerBase.from_pretrained(cls, pretrained_model_name_or_path, cache_dir, force_download, local_files_only, token, revision, trust_remote_code, *init_inputs, **kwargs)
2210 else:
2211 logger.info(f"loading file {file_path} from cache at {resolved_vocab_files[file_id]}")
-> 2213 return cls._from_pretrained(
2214 resolved_vocab_files,
2215 pretrained_model_name_or_path,
2216 init_configuration,
2217 *init_inputs,
2218 token=token,
2219 cache_dir=cache_dir,
2220 local_files_only=local_files_only,
2221 _commit_hash=commit_hash,
2222 _is_local=is_local,
2223 trust_remote_code=trust_remote_code,
2224 **kwargs,
2225 )

File ~/.local/lib/python3.8/site-packages/transformers/tokenization_utils_base.py:2447, in PreTrainedTokenizerBase._from_pretrained(cls, resolved_vocab_files, pretrained_model_name_or_path, init_configuration, token, cache_dir, local_files_only, _commit_hash, _is_local, trust_remote_code, *init_inputs, **kwargs)
2445 # Instantiate the tokenizer.
2446 try:
-> 2447 tokenizer = cls(*init_inputs, **init_kwargs)
2448 except import_protobuf_decode_error():
2449 logger.info(
2450 "Unable to load tokenizer model from SPM, loading from TikToken will be attempted instead."
2451 "(Google protobuf error: Tried to load SPM model with non-SPM vocab file).",
2452 )

File ~/.local/lib/python3.8/site-packages/transformers/models/llama/tokenization_llama_fast.py:157, in LlamaTokenizerFast.__init__(self, vocab_file, tokenizer_file, clean_up_tokenization_spaces, unk_token, bos_token, eos_token, add_bos_token, add_eos_token, use_default_system_prompt, legacy, add_prefix_space, **kwargs)
154 if add_prefix_space is not None:
155 kwargs["from_slow"] = True
--> 157 super().__init__(
158 vocab_file=vocab_file,
159 tokenizer_file=tokenizer_file,
160 clean_up_tokenization_spaces=clean_up_tokenization_spaces,
161 unk_token=unk_token,
162 bos_token=bos_token,
163 eos_token=eos_token,
164 add_bos_token=add_bos_token,
165 add_eos_token=add_eos_token,
166 use_default_system_prompt=use_default_system_prompt,
167 add_prefix_space=add_prefix_space,
168 legacy=legacy,
169 **kwargs,
170 )
171 self._add_bos_token = add_bos_token
172 self._add_eos_token = add_eos_token

File ~/.local/lib/python3.8/site-packages/transformers/tokenization_utils_fast.py:138, in PreTrainedTokenizerFast.__init__(self, *args, **kwargs)
136 self.vocab_file = kwargs.get("vocab_file", None)
137 self.additional_special_tokens = kwargs.get("additional_special_tokens", [])
--> 138 fast_tokenizer = convert_slow_tokenizer(self, from_tiktoken=True)
139 slow_tokenizer = None
140 else:

File ~/.local/lib/python3.8/site-packages/transformers/convert_slow_tokenizer.py:1638, in convert_slow_tokenizer(transformer_tokenizer, from_tiktoken)
1633 return TikTokenConverter(
1634 vocab_file=transformer_tokenizer.vocab_file,
1635 additional_special_tokens=transformer_tokenizer.additional_special_tokens,
1636 ).converted()
1637 except Exception:
-> 1638 raise ValueError(
1639 f"Converting from Tiktoken failed, if a converter for SentencePiece is available, provide a model path "
1640 f"with a SentencePiece tokenizer.model file."
1641 f"Currently available slow->fast convertors: {list(SLOW_TO_FAST_CONVERTERS.keys())}"
1642 )

ValueError: Converting from Tiktoken failed, if a converter for SentencePiece is available, provide a model path with a SentencePiece tokenizer.model file.Currently available slow->fast convertors: ['AlbertTokenizer', 'BartTokenizer', 'BarthezTokenizer', 'BertTokenizer', 'BigBirdTokenizer', 'BlenderbotTokenizer', 'CamembertTokenizer', 'CLIPTokenizer', 'CodeGenTokenizer', 'ConvBertTokenizer', 'DebertaTokenizer', 'DebertaV2Tokenizer', 'DistilBertTokenizer', 'DPRReaderTokenizer', 'DPRQuestionEncoderTokenizer', 'DPRContextEncoderTokenizer', 'ElectraTokenizer', 'FNetTokenizer', 'FunnelTokenizer', 'GPT2Tokenizer', 'HerbertTokenizer', 'LayoutLMTokenizer', 'LayoutLMv2Tokenizer', 'LayoutLMv3Tokenizer', 'LayoutXLMTokenizer', 'LongformerTokenizer', 'LEDTokenizer', 'LxmertTokenizer', 'MarkupLMTokenizer', 'MBartTokenizer', 'MBart50Tokenizer', 'MPNetTokenizer', 'MobileBertTokenizer', 'MvpTokenizer', 'NllbTokenizer', 'OpenAIGPTTokenizer', 'PegasusTokenizer', 'Qwen2Tokenizer', 'RealmTokenizer', 'ReformerTokenizer', 'RemBertTokenizer', 'RetriBertTokenizer', 'RobertaTokenizer', 'RoFormerTokenizer', 'SeamlessM4TTokenizer', 'SqueezeBertTokenizer', 'T5Tokenizer', 'UdopTokenizer', 'WhisperTokenizer', 'XLMRobertaTokenizer', 'XLNetTokenizer', 'SplinterTokenizer', 'XGLMTokenizer', 'LlamaTokenizer', 'CodeLlamaTokenizer', 'GemmaTokenizer', 'Phi3Tokenizer']"
}
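
The traceback shows that transformers fell back to the tiktoken conversion path (convert_slow_tokenizer with from_tiktoken=True) and that path failed, which usually means the repo does not ship a ready-made tokenizer.json and the file it found as vocab_file is not a tiktoken BPE file. A quick way to check which tokenizer files the repo actually contains (a sketch, not from the original post; `model_name` is a hypothetical placeholder for the repo id used in the failing cell):

```python
# List tokenizer-related files in the repo to see whether it ships a
# ready-made fast tokenizer (tokenizer.json) or a SentencePiece tokenizer.model.
from huggingface_hub import list_repo_files

model_name = "org/model"  # hypothetical placeholder for the repo id
files = list_repo_files(model_name)
print([f for f in files if "token" in f.lower()])
```

If tokenizer.json shows up in that list, a recent transformers release can load it directly without any slow->fast conversion.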

Updating the transformers version should fix this, I think.
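
For reference, this is the retry I would attempt after upgrading (a minimal sketch; `model_name` stands in for the repo id from the original notebook cell, and the pip upgrade is run in the shell first). Note the traceback paths show Python 3.8, and recent transformers releases may require a newer Python.

```python
# Run first in the shell: pip install --upgrade transformers tokenizers
from transformers import AutoTokenizer

model_name = "org/model"  # placeholder for the repo id used above
tokenizer = AutoTokenizer.from_pretrained(model_name)
print("tokenizer loaded")
tokenizer.save_pretrained(f"tokenizer_{model_name.replace('/', '--')}")
```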
