Update gptx_tokenizer.py
gptx_tokenizer.py (+9 -1)
@@ -233,6 +233,7 @@ class HFGPTXTokenizer(PreTrainedTokenizer):
         token_ids: Union[List[int], List[List[int]]],
         num_threads: Optional[int] = None,
         skip_special_tokens: bool = False,
+        clean_up_tokenization_spaces: bool = False,
     ) -> str:
         """
         Decode a list of token IDs into a string.
@@ -244,7 +245,14 @@ class HFGPTXTokenizer(PreTrainedTokenizer):
         """
         output = self.tok.decode(input=token_ids, num_threads=num_threads)
         if skip_special_tokens:
-
+            output = [token for token in output if token not in self.additional_special_tokens]
+        if clean_up_tokenization_spaces:
+            warnings.warn(
+                "When cleaning up tokenization spaces, this will not behave "
+                "like the original `GPTXTokenizer`. Please supply "
+                "`clean_up_tokenization_spaces=False` for decoding."
+            )
+            output = self.clean_up_tokenization(output)
         return output
 
     def _convert_id_to_token(self, index: int) -> str:
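For context, a minimal usage sketch of the updated decode() signature. The repository id and input text below are placeholders and not part of this commit; loading a hub repo that ships a custom gptx_tokenizer.py requires trust_remote_code=True.

```python
# Hypothetical usage sketch for the new decode() flags; the repo id and input
# text are placeholders, not taken from this commit.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained(
    "openGPT-X/Teuken-7B-instruct-research-v0.4",  # assumed repo id that ships gptx_tokenizer.py
    trust_remote_code=True,
)

ids = tok("Hello world!")["input_ids"]

# Default behaviour: raw decode, special tokens kept, no cleanup.
print(tok.decode(ids))

# skip_special_tokens=True now filters out self.additional_special_tokens;
# clean_up_tokenization_spaces=True triggers the new warning and applies
# the inherited clean_up_tokenization() to the result.
print(tok.decode(ids, skip_special_tokens=True, clean_up_tokenization_spaces=True))
```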
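The clean_up_tokenization() call falls back to the helper inherited from transformers' PreTrainedTokenizerBase rather than any GPTX-specific logic, which is why the warning above flags the behaviour difference. A rough approximation of what that inherited cleanup does (a sketch, not the exact library code):

```python
# Approximation of PreTrainedTokenizerBase.clean_up_tokenization(): it collapses
# spaces before punctuation and common English contractions.
def clean_up_tokenization(out_string: str) -> str:
    return (
        out_string.replace(" .", ".")
        .replace(" ?", "?")
        .replace(" !", "!")
        .replace(" ,", ",")
        .replace(" ' ", "'")
        .replace(" n't", "n't")
        .replace(" 'm", "'m")
        .replace(" 's", "'s")
        .replace(" 've", "'ve")
        .replace(" 're", "'re")
    )

print(clean_up_tokenization("Hello , world ! It 's a test ."))
# -> "Hello, world! It's a test."
```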