fast tokenizer and stream_chat fix (#5)

Browse files

- fast tokenizer and stream_chat fix (2335a07e76b814b60618eb65f7010f2603d057a7)
- keep internlm2 only (7b2becefb0b4001364bbc77e33efa7a47716a4dd)
- rename config to internlm2 (dfa1f4d7f2709d3d228f61e4651eef5940fd8863)

Co-authored-by: Shuhao Xing <[email protected]>

Files changed (6) hide show

config.json +2 -2
configuration_internlm.py → configuration_internlm2.py +13 -26
modeling_internlm2.py +12 -6
tokenization_internlm.py → tokenization_internlm2.py +6 -10
tokenization_internlm2_fast.py +214 -0
tokenizer_config.json +3 -3

config.json CHANGED Viewed

@@ -3,7 +3,7 @@
     "InternLM2ForCausalLM"
   ],
   "auto_map": {
-    "AutoConfig": "configuration_internlm.InternLMConfig",
     "AutoModelForCausalLM": "modeling_internlm2.InternLM2ForCausalLM",
     "AutoModel": "modeling_internlm2.InternLM2ForCausalLM"
   },
@@ -15,7 +15,7 @@
   "initializer_range": 0.02,
   "intermediate_size": 14336,
   "max_position_embeddings": 32768,
-  "model_type": "internlm",
   "num_attention_heads": 32,
   "num_hidden_layers": 32,
   "num_key_value_heads": 8,

     "InternLM2ForCausalLM"
   ],
   "auto_map": {
+    "AutoConfig": "configuration_internlm2.InternLM2Config",
     "AutoModelForCausalLM": "modeling_internlm2.InternLM2ForCausalLM",
     "AutoModel": "modeling_internlm2.InternLM2ForCausalLM"
   },
   "initializer_range": 0.02,
   "intermediate_size": 14336,
   "max_position_embeddings": 32768,
+  "model_type": "internlm2",
   "num_attention_heads": 32,
   "num_hidden_layers": 32,
   "num_key_value_heads": 8,

configuration_internlm.py → configuration_internlm2.py RENAMED Viewed

@@ -1,10 +1,7 @@
 # coding=utf-8
-# Copyright (c) InternLM. All rights reserved.
 #
-# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
-# and OPT implementations in this library. It has been modified from its
-# original forms to accommodate minor architectural differences compared
-# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -17,21 +14,22 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" InternLM model configuration"""
 from transformers.configuration_utils import PretrainedConfig
 from transformers.utils import logging
 logger = logging.get_logger(__name__)
-INTERNLM_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
-class InternLMConfig(PretrainedConfig):
     r"""
-    This is the configuration class to store the configuration of a [`InternLMModel`]. It is used to instantiate
-    an InternLM model according to the specified arguments, defining the model architecture. Instantiating a
-    configuration with the defaults will yield a similar configuration to that of the InternLM-7B.
     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
     documentation from [`PretrainedConfig`] for more information.
@@ -39,8 +37,8 @@ class InternLMConfig(PretrainedConfig):
     Args:
         vocab_size (`int`, *optional*, defaults to 32000):
-            Vocabulary size of the InternLM model. Defines the number of different tokens that can be represented by the
-            `inputs_ids` passed when calling [`InternLMModel`]
         hidden_size (`int`, *optional*, defaults to 4096):
             Dimension of the hidden representations.
         intermediate_size (`int`, *optional*, defaults to 11008):
@@ -73,19 +71,8 @@ class InternLMConfig(PretrainedConfig):
             Whether to tie weight embeddings
         Example:
-    ```python
-    >>> from transformers import InternLMModel, InternLMConfig
-    >>> # Initializing a InternLM internlm-7b style configuration
-    >>> configuration = InternLMConfig()
-    >>> # Initializing a model from the internlm-7b style configuration
-    >>> model = InternLMModel(configuration)
-    >>> # Accessing the model configuration
-    >>> configuration = model.config
-    ```"""
-    model_type = "internlm"
     _auto_class = "AutoConfig"
     def __init__(  # pylint: disable=W0102

 # coding=utf-8
+# Copyright (c) The InternLM team and The HuggingFace Inc. team. All rights reserved.
 #
+# This code is based on transformers/src/transformers/models/llama/configuration_llama.py
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+""" InternLM2 model configuration"""
 from transformers.configuration_utils import PretrainedConfig
 from transformers.utils import logging
 logger = logging.get_logger(__name__)
+INTERNLM2_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
+# Modified from transformers.models.llama.configuration_llama.LlamaConfig
+class InternLM2Config(PretrainedConfig):
     r"""
+    This is the configuration class to store the configuration of a [`InternLM2Model`]. It is used to instantiate
+    an InternLM2 model according to the specified arguments, defining the model architecture. Instantiating a
+    configuration with the defaults will yield a similar configuration to that of the InternLM2-7B.
     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
     documentation from [`PretrainedConfig`] for more information.
     Args:
         vocab_size (`int`, *optional*, defaults to 32000):
+            Vocabulary size of the InternLM2 model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`InternLM2Model`]
         hidden_size (`int`, *optional*, defaults to 4096):
             Dimension of the hidden representations.
         intermediate_size (`int`, *optional*, defaults to 11008):
             Whether to tie weight embeddings
         Example:
+    """
+    model_type = "internlm2"
     _auto_class = "AutoConfig"
     def __init__(  # pylint: disable=W0102

modeling_internlm2.py CHANGED Viewed

@@ -45,7 +45,7 @@ try:
 except:  # noqa # pylint: disable=bare-except
     BaseStreamer = None
-from .configuration_internlm import InternLMConfig as InternLM2Config
 logger = logging.get_logger(__name__)
@@ -1134,11 +1134,12 @@ class InternLM2ForCausalLM(InternLM2PreTrainedModel):
         return reordered_past
     def build_inputs(self, tokenizer, query: str, history: List[Tuple[str, str]] = [], meta_instruction=""):
-        prompt = ""
-        if meta_instruction:
-            prompt += f"""<s><|im_start|>system\n{meta_instruction}<|im_end|>\n"""
         else:
-            prompt += "<s>"
         for record in history:
             prompt += f"""<|im_start|>user\n{record[0]}<|im_end|>\n<|im_start|>assistant\n{record[1]}<|im_end|>\n"""
         prompt += f"""<|im_start|>user\n{query}<|im_end|>\n<|im_start|>assistant\n"""
@@ -1214,6 +1215,7 @@ class InternLM2ForCausalLM(InternLM2PreTrainedModel):
                 self.query = query
                 self.history = history
                 self.response = ""
                 self.received_inputs = False
                 self.queue.put((self.response, history + [(self.query, self.response)]))
@@ -1228,11 +1230,15 @@ class InternLM2ForCausalLM(InternLM2PreTrainedModel):
                     self.received_inputs = True
                     return
-                token = self.tokenizer.decode([value[-1]], skip_special_tokens=True)
                 if token.strip() != "<|im_end|>":
                     self.response = self.response + token
                     history = self.history + [(self.query, self.response)]
                     self.queue.put((self.response, history))
             def end(self):
                 self.queue.put(None)

 except:  # noqa # pylint: disable=bare-except
     BaseStreamer = None
+from .configuration_internlm2 import InternLM2Config
 logger = logging.get_logger(__name__)
         return reordered_past
     def build_inputs(self, tokenizer, query: str, history: List[Tuple[str, str]] = [], meta_instruction=""):
+        if tokenizer.add_bos_token:
+            prompt = ""
         else:
+            prompt = tokenizer.bos_token
+        if meta_instruction:
+            prompt += f"""<|im_start|>system\n{meta_instruction}<|im_end|>\n"""
         for record in history:
             prompt += f"""<|im_start|>user\n{record[0]}<|im_end|>\n<|im_start|>assistant\n{record[1]}<|im_end|>\n"""
         prompt += f"""<|im_start|>user\n{query}<|im_end|>\n<|im_start|>assistant\n"""
                 self.query = query
                 self.history = history
                 self.response = ""
+                self.chat = []
                 self.received_inputs = False
                 self.queue.put((self.response, history + [(self.query, self.response)]))
                     self.received_inputs = True
                     return
+                self.cache.extend(value.tolist())
+                token = self.tokenizer.decode(self.cache, skip_special_tokens=True)
                 if token.strip() != "<|im_end|>":
                     self.response = self.response + token
                     history = self.history + [(self.query, self.response)]
                     self.queue.put((self.response, history))
+                    self.cache = []
+                else:
+                    self.end()
             def end(self):
                 self.queue.put(None)

tokenization_internlm.py → tokenization_internlm2.py RENAMED Viewed

@@ -1,10 +1,7 @@
 # coding=utf-8
-# Copyright (c) InternLM. All rights reserved.
 #
-# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
-# and OPT implementations in this library. It has been modified from its
-# original forms to accommodate minor architectural differences compared
-# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -18,7 +15,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""Tokenization classes for IntermLM."""
 import os
 from shutil import copyfile
 from typing import Any, Dict, List, Optional, Tuple
@@ -34,9 +31,10 @@ VOCAB_FILES_NAMES = {"vocab_file": "./tokenizer.model"}
 PRETRAINED_VOCAB_FILES_MAP = {}
-class InternLMTokenizer(PreTrainedTokenizer):
     """
-    Construct a InternLM tokenizer. Based on byte-level Byte-Pair-Encoding.
     Args:
         vocab_file (`str`):
@@ -79,8 +77,6 @@ class InternLMTokenizer(PreTrainedTokenizer):
             **kwargs,
         )
-        """ Initialization"""
     @property
     def no_prefix_space_tokens(self):
         if self._no_prefix_space_tokens is None:

 # coding=utf-8
+# Copyright (c) The InternLM team and The HuggingFace Inc. team. All rights reserved.
 #
+# This code is based on transformers/src/transformers/models/llama/tokenization_llama.py
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+"""Tokenization classes for InternLM."""
 import os
 from shutil import copyfile
 from typing import Any, Dict, List, Optional, Tuple
 PRETRAINED_VOCAB_FILES_MAP = {}
+# Modified from transformers.models.llama.tokenization_llama.LlamaTokenizer
+class InternLM2Tokenizer(PreTrainedTokenizer):
     """
+    Construct a InternLM2 tokenizer. Based on byte-level Byte-Pair-Encoding.
     Args:
         vocab_file (`str`):
             **kwargs,
         )
     @property
     def no_prefix_space_tokens(self):
         if self._no_prefix_space_tokens is None:

tokenization_internlm2_fast.py ADDED Viewed

	@@ -0,0 +1,214 @@

+# coding=utf-8
+# Copyright (c) The InternLM team and The HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on transformers/src/transformers/models/llama/tokenization_llama_fast.py
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization Fast class for InternLM."""
+import os
+from shutil import copyfile
+from typing import Any, Dict, Optional, Tuple
+from tokenizers import processors, decoders, Tokenizer, normalizers
+from tokenizers.models import BPE
+from transformers.tokenization_utils_fast import PreTrainedTokenizerFast
+from transformers.utils import logging
+from transformers.convert_slow_tokenizer import (
+    SLOW_TO_FAST_CONVERTERS,
+    SpmConverter,
+    SentencePieceExtractor,
+)
+from .tokenization_internlm2 import InternLM2Tokenizer
+logger = logging.get_logger(__name__)  # module-level logger named after this module
+VOCAB_FILES_NAMES = {"vocab_file": "./tokenizer.model"}  # maps the `vocab_file` argument name to its file name
+# Modified from transformers.convert_slow_tokenizer.LlamaConverter
+class InternLM2Converter(SpmConverter):
+    handle_byte_fallback = True
+    def vocab(self, proto):
+        vocab = [
+            ("<unk>", 0.0),
+            ("<s>", 0.0),
+            ("</s>", 0.0),
+        ]
+        vocab += [(piece.piece, piece.score) for piece in proto.pieces[3:]]
+        return vocab
+    def unk_id(self, proto):
+        unk_id = 0
+        return unk_id
+    def decoder(self, replacement, add_prefix_space):
+        return decoders.Sequence(
+            [
+                decoders.Replace("▁", " "),
+                decoders.ByteFallback(),
+                decoders.Fuse(),
+                decoders.Strip(content=" ", left=1),
+            ]
+        )
+    def tokenizer(self, proto):
+        model_type = proto.trainer_spec.model_type
+        vocab_scores = self.vocab(proto)
+        # special tokens
+        added_tokens = self.original_tokenizer.added_tokens_decoder
+        for i in range(len(vocab_scores)):
+            piece, score = vocab_scores[i]
+            if i in added_tokens:
+                vocab_scores[i] = (added_tokens[i].content, score)
+        if model_type == 1:
+            raise RuntimeError("InternLM2 is supposed to be a BPE model!")
+        elif model_type == 2:
+            _, merges = SentencePieceExtractor(self.original_tokenizer.vocab_file).extract(vocab_scores)
+            bpe_vocab = {word: i for i, (word, _score) in enumerate(vocab_scores)}
+            tokenizer = Tokenizer(
+                BPE(bpe_vocab, merges, unk_token=proto.trainer_spec.unk_piece, fuse_unk=True, byte_fallback=True)
+            )
+            tokenizer.add_special_tokens(
+                [ added_token for index, added_token in added_tokens.items()]
+            )
+        else:
+            raise Exception(
+                "You're trying to run a `Unigram` model but you're file was trained with a different algorithm"
+            )
+        return tokenizer
+    def normalizer(self, proto):
+        normalizers_list = []
+        if proto.normalizer_spec.add_dummy_prefix:
+            normalizers_list.append(normalizers.Prepend(prepend="▁"))
+        normalizers_list.append(normalizers.Replace(pattern=" ", content="▁"))
+        return normalizers.Sequence(normalizers_list)
+    def pre_tokenizer(self, replacement, add_prefix_space):
+        return None
+SLOW_TO_FAST_CONVERTERS["InternLM2Tokenizer"] = InternLM2Converter  # register the converter under the slow tokenizer's class name
+# Modified from transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast -> InternLM2TokenizerFast
+class InternLM2TokenizerFast(PreTrainedTokenizerFast):
+    vocab_files_names = VOCAB_FILES_NAMES
+    slow_tokenizer_class = InternLM2Tokenizer
+    padding_side = "left"
+    model_input_names = ["input_ids", "attention_mask"]
+    _auto_class = "AutoTokenizer"
+    def __init__(
+        self,
+        vocab_file,
+        unk_token="<unk>",
+        bos_token="<s>",
+        eos_token="</s>",
+        pad_token="</s>",
+        sp_model_kwargs: Optional[Dict[str, Any]] = None,
+        add_bos_token=True,
+        add_eos_token=False,
+        decode_with_prefix_space=False,
+        clean_up_tokenization_spaces=False,
+        **kwargs,
+    ):
+        super().__init__(
+            vocab_file=vocab_file,
+            unk_token=unk_token,
+            bos_token=bos_token,
+            eos_token=eos_token,
+            pad_token=pad_token,
+            sp_model_kwargs=sp_model_kwargs,
+            add_bos_token=add_bos_token,
+            add_eos_token=add_eos_token,
+            decode_with_prefix_space=decode_with_prefix_space,
+            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+            **kwargs,
+        )
+        self._add_bos_token = add_bos_token
+        self._add_eos_token = add_eos_token
+        self.update_post_processor()
+        self.vocab_file = vocab_file
+    @property
+    def can_save_slow_tokenizer(self) -> bool:
+        return os.path.isfile(self.vocab_file) if self.vocab_file else False
+    def update_post_processor(self):
+        """
+        Updates the underlying post processor with the current `bos_token` and `eos_token`.
+        """
+        bos = self.bos_token
+        bos_token_id = self.bos_token_id
+        if bos is None and self.add_bos_token:
+            raise ValueError("add_bos_token = True but bos_token = None")
+        eos = self.eos_token
+        eos_token_id = self.eos_token_id
+        if eos is None and self.add_eos_token:
+            raise ValueError("add_eos_token = True but eos_token = None")
+        single = f"{(bos+':0 ') if self.add_bos_token else ''}$A:0{(' '+eos+':0') if self.add_eos_token else ''}"
+        pair = f"{single}{(' '+bos+':1') if self.add_bos_token else ''} $B:1{(' '+eos+':1') if self.add_eos_token else ''}"
+        special_tokens = []
+        if self.add_bos_token:
+            special_tokens.append((bos, bos_token_id))
+        if self.add_eos_token:
+            special_tokens.append((eos, eos_token_id))
+        self._tokenizer.post_processor = processors.TemplateProcessing(
+            single=single, pair=pair, special_tokens=special_tokens
+        )
+    @property
+    def add_eos_token(self):
+        return self._add_eos_token
+    @property
+    def add_bos_token(self):
+        return self._add_bos_token
+    @add_eos_token.setter
+    def add_eos_token(self, value):
+        self._add_eos_token = value
+        self.update_post_processor()
+    @add_bos_token.setter
+    def add_bos_token(self, value):
+        self._add_bos_token = value
+        self.update_post_processor()
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
+        if not self.can_save_slow_tokenizer:
+            raise ValueError(
+                "Your fast tokenizer does not have the necessary information to save the vocabulary for a slow "
+                "tokenizer."
+            )
+        if not os.path.isdir(save_directory):
+            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
+            return
+        out_vocab_file = os.path.join(
+            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
+        )
+        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
+            copyfile(self.vocab_file, out_vocab_file)
+        return (out_vocab_file,)

tokenizer_config.json CHANGED Viewed

@@ -1,8 +1,8 @@
 {
   "auto_map": {
     "AutoTokenizer": [
-      "tokenization_internlm.InternLMTokenizer",
-      null
     ]
   },
   "bos_token": "<s>",
@@ -10,7 +10,7 @@
   "eos_token": "</s>",
   "model_max_length": 1000000000000000019884624838656,
   "pad_token": "</s>",
-  "tokenizer_class": "InternLMTokenizer",
   "unk_token": "<unk>",
   "added_tokens_decoder": {
     "0": {

 {
   "auto_map": {
     "AutoTokenizer": [
+      "tokenization_internlm2.InternLM2Tokenizer",
+      "tokenization_internlm2_fast.InternLM2TokenizerFast"
     ]
   },
   "bos_token": "<s>",
   "eos_token": "</s>",
   "model_max_length": 1000000000000000019884624838656,
   "pad_token": "</s>",
+  "tokenizer_class": "InternLM2Tokenizer",
   "unk_token": "<unk>",
   "added_tokens_decoder": {
     "0": {