Update gptx_tokenizer.py
gptx_tokenizer.py (+9 -1)
@@ -233,6 +233,7 @@ class HFGPTXTokenizer(PreTrainedTokenizer):
         token_ids: Union[List[int], List[List[int]]],
         num_threads: Optional[int] = None,
         skip_special_tokens: bool = False,
+        clean_up_tokenization_spaces: bool = False,
     ) -> str:
         """
         Decode a list of token IDs into a string.
@@ -244,7 +245,14 @@ class HFGPTXTokenizer(PreTrainedTokenizer):
         """
         output = self.tok.decode(input=token_ids, num_threads=num_threads)
         if skip_special_tokens:
-
+            output = [token for token in output if token not in self.additional_special_tokens]
+        if clean_up_tokenization_spaces:
+            warnings.warn(
+                "When cleaning up tokenization spaces, this will not behave "
+                "like the original `GPTXTokenizer`. Please supply "
+                "`clean_up_tokenization_spaces=False` for decoding."
+            )
+            output = self.clean_up_tokenization(output)
         return output
 
     def _convert_id_to_token(self, index: int) -> str:
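For context, a minimal usage sketch of the updated decode() signature. The repository id and input text below are placeholders and not part of this commit; loading a hub repo that ships a custom gptx_tokenizer.py requires trust_remote_code=True.

```python
# Hypothetical usage sketch for the new decode() flags; the repo id and input
# text are placeholders, not taken from this commit.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained(
    "openGPT-X/Teuken-7B-instruct-research-v0.4",  # assumed repo id that ships gptx_tokenizer.py
    trust_remote_code=True,
)

ids = tok("Hello world!")["input_ids"]

# Default behaviour: raw decode, special tokens kept, no cleanup.
print(tok.decode(ids))

# skip_special_tokens=True now filters out self.additional_special_tokens;
# clean_up_tokenization_spaces=True triggers the new warning and applies
# the inherited clean_up_tokenization() to the result.
print(tok.decode(ids, skip_special_tokens=True, clean_up_tokenization_spaces=True))
```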
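The clean_up_tokenization() call falls back to the helper inherited from transformers' PreTrainedTokenizerBase rather than any GPTX-specific logic, which is why the warning above flags the behaviour difference. A rough approximation of what that inherited cleanup does (a sketch, not the exact library code):

```python
# Approximation of PreTrainedTokenizerBase.clean_up_tokenization(): it collapses
# spaces before punctuation and common English contractions.
def clean_up_tokenization(out_string: str) -> str:
    return (
        out_string.replace(" .", ".")
        .replace(" ?", "?")
        .replace(" !", "!")
        .replace(" ,", ",")
        .replace(" ' ", "'")
        .replace(" n't", "n't")
        .replace(" 'm", "'m")
        .replace(" 's", "'s")
        .replace(" 've", "'ve")
        .replace(" 're", "'re")
    )

print(clean_up_tokenization("Hello , world ! It 's a test ."))
# -> "Hello, world! It's a test."
```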