update tokenizers

Files changed (6) hide show

config.json +3 -3
configuration_internlm.py → configuration_internlm2.py +18 -26
modeling_internlm2.py +189 -69
tokenization_internlm.py → tokenization_internlm2.py +6 -10
tokenization_internlm2_fast.py +214 -0
tokenizer_config.json +63 -14

config.json CHANGED Viewed

@@ -3,7 +3,7 @@
     "InternLM2ForCausalLM"
   ],
   "auto_map": {
-    "AutoConfig": "configuration_internlm.InternLMConfig",
     "AutoModelForCausalLM": "modeling_internlm2.InternLM2ForCausalLM",
     "AutoModel": "modeling_internlm2.InternLM2ForCausalLM"
   },
@@ -15,14 +15,14 @@
   "initializer_range": 0.02,
   "intermediate_size": 8192,
   "max_position_embeddings": 32768,
-  "model_type": "internlm",
   "num_attention_heads": 16,
   "num_hidden_layers": 24,
   "num_key_value_heads": 8,
   "pad_token_id": 2,
   "rms_norm_eps": 1e-05,
   "rope_scaling": {
-    "factor": 1.0,
     "type": "dynamic"
   },
   "rope_theta": 1000000,

     "InternLM2ForCausalLM"
   ],
   "auto_map": {
+    "AutoConfig": "configuration_internlm2.InternLM2Config",
     "AutoModelForCausalLM": "modeling_internlm2.InternLM2ForCausalLM",
     "AutoModel": "modeling_internlm2.InternLM2ForCausalLM"
   },
   "initializer_range": 0.02,
   "intermediate_size": 8192,
   "max_position_embeddings": 32768,
+  "model_type": "internlm2",
   "num_attention_heads": 16,
   "num_hidden_layers": 24,
   "num_key_value_heads": 8,
   "pad_token_id": 2,
   "rms_norm_eps": 1e-05,
   "rope_scaling": {
+    "factor": 2.0,
     "type": "dynamic"
   },
   "rope_theta": 1000000,

configuration_internlm.py → configuration_internlm2.py RENAMED Viewed

@@ -1,10 +1,7 @@
 # coding=utf-8
-# Copyright (c) InternLM. All rights reserved.
 #
-# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
-# and OPT implementations in this library. It has been modified from its
-# original forms to accommodate minor architectural differences compared
-# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -17,21 +14,22 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" InternLM model configuration"""
 from transformers.configuration_utils import PretrainedConfig
 from transformers.utils import logging
 logger = logging.get_logger(__name__)
-INTERNLM_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
-class InternLMConfig(PretrainedConfig):
     r"""
-    This is the configuration class to store the configuration of a [`InternLMModel`]. It is used to instantiate
-    an InternLM model according to the specified arguments, defining the model architecture. Instantiating a
-    configuration with the defaults will yield a similar configuration to that of the InternLM-7B.
     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
     documentation from [`PretrainedConfig`] for more information.
@@ -39,8 +37,8 @@ class InternLMConfig(PretrainedConfig):
     Args:
         vocab_size (`int`, *optional*, defaults to 32000):
-            Vocabulary size of the InternLM model. Defines the number of different tokens that can be represented by the
-            `inputs_ids` passed when calling [`InternLMModel`]
         hidden_size (`int`, *optional*, defaults to 4096):
             Dimension of the hidden representations.
         intermediate_size (`int`, *optional*, defaults to 11008):
@@ -73,19 +71,8 @@ class InternLMConfig(PretrainedConfig):
             Whether to tie weight embeddings
         Example:
-    ```python
-    >>> from transformers import InternLMModel, InternLMConfig
-    >>> # Initializing a InternLM internlm-7b style configuration
-    >>> configuration = InternLMConfig()
-    >>> # Initializing a model from the internlm-7b style configuration
-    >>> model = InternLMModel(configuration)
-    >>> # Accessing the model configuration
-    >>> configuration = model.config
-    ```"""
-    model_type = "internlm"
     _auto_class = "AutoConfig"
     def __init__(  # pylint: disable=W0102
@@ -108,6 +95,7 @@ class InternLMConfig(PretrainedConfig):
         bias=True,
         rope_theta=10000,
         rope_scaling=None,
         **kwargs,
     ):
         self.vocab_size = vocab_size
@@ -129,6 +117,10 @@ class InternLMConfig(PretrainedConfig):
         self.rope_theta = rope_theta
         self.rope_scaling = rope_scaling
         self._rope_scaling_validation()
         super().__init__(
             pad_token_id=pad_token_id,
             bos_token_id=bos_token_id,

 # coding=utf-8
+# Copyright (c) The InternLM team and The HuggingFace Inc. team. All rights reserved.
 #
+# This code is based on transformers/src/transformers/models/llama/configuration_llama.py
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+""" InternLM2 model configuration"""
 from transformers.configuration_utils import PretrainedConfig
 from transformers.utils import logging
 logger = logging.get_logger(__name__)
+INTERNLM2_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
+# Modified from transformers.models.llama.configuration_llama.LlamaConfig
+class InternLM2Config(PretrainedConfig):
     r"""
+    This is the configuration class to store the configuration of an [`InternLM2Model`]. It is used to instantiate
+    an InternLM2 model according to the specified arguments, defining the model architecture. Instantiating a
+    configuration with the defaults will yield a similar configuration to that of the InternLM2-7B.
     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
     documentation from [`PretrainedConfig`] for more information.
     Args:
         vocab_size (`int`, *optional*, defaults to 32000):
+            Vocabulary size of the InternLM2 model. Defines the number of different tokens that can be represented by the
+            `input_ids` passed when calling [`InternLM2Model`].
         hidden_size (`int`, *optional*, defaults to 4096):
             Dimension of the hidden representations.
         intermediate_size (`int`, *optional*, defaults to 11008):
             Whether to tie weight embeddings
         Example:
+    """
+    model_type = "internlm2"
     _auto_class = "AutoConfig"
     def __init__(  # pylint: disable=W0102
         bias=True,
         rope_theta=10000,
         rope_scaling=None,
+        attn_implementation="eager",
         **kwargs,
     ):
         self.vocab_size = vocab_size
         self.rope_theta = rope_theta
         self.rope_scaling = rope_scaling
         self._rope_scaling_validation()
+        self.attn_implementation = attn_implementation
+        if self.attn_implementation is None:
+            self.attn_implementation = "eager"
         super().__init__(
             pad_token_id=pad_token_id,
             bos_token_id=bos_token_id,

modeling_internlm2.py CHANGED Viewed

@@ -1,10 +1,6 @@
-# coding=utf-8
-# # Copyright (c) InternLM. All rights reserved.
 #
-# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
-# and OPT implementations in this library. It has been modified from its
-# original forms to accommodate minor architectural differences compared
-# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -25,6 +21,7 @@ import warnings
 from typing import List, Optional, Tuple, Union
 import torch
 import torch.utils.checkpoint
 from einops import rearrange
 from torch import nn
@@ -48,12 +45,37 @@ try:
 except:  # noqa # pylint: disable=bare-except
     BaseStreamer = None
-from .configuration_internlm import InternLMConfig as InternLM2Config
 logger = logging.get_logger(__name__)
 _CONFIG_FOR_DOC = "InternLM2Config"
 # Copied from transformers.models.bart.modeling_bart._make_causal_mask
 def _make_causal_mask(
@@ -88,6 +110,7 @@ def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int]
     return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
 class InternLM2RMSNorm(nn.Module):
     def __init__(self, hidden_size, eps=1e-6):
         """
@@ -105,6 +128,7 @@ class InternLM2RMSNorm(nn.Module):
         return self.weight * hidden_states.to(input_dtype)
 class InternLM2RotaryEmbedding(nn.Module):
     def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
         super().__init__()
@@ -141,6 +165,7 @@ class InternLM2RotaryEmbedding(nn.Module):
         )
 class InternLM2LinearScalingRotaryEmbedding(InternLM2RotaryEmbedding):
     """InternLM2RotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""
@@ -160,6 +185,7 @@ class InternLM2LinearScalingRotaryEmbedding(InternLM2RotaryEmbedding):
         self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
 class InternLM2DynamicNTKScalingRotaryEmbedding(InternLM2RotaryEmbedding):
     """InternLM2RotaryEmbedding extended with Dynamic NTK scaling.
     Credits to the Reddit users /u/bloc97 and /u/emozilla.
@@ -188,6 +214,7 @@ class InternLM2DynamicNTKScalingRotaryEmbedding(InternLM2RotaryEmbedding):
         self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
 def rotate_half(x):
     """Rotates half the hidden dims of the input."""
     x1 = x[..., : x.shape[-1] // 2]
@@ -195,12 +222,13 @@ def rotate_half(x):
     return torch.cat((-x2, x1), dim=-1)
-def apply_rotary_pos_emb(q, k, cos, sin, position_ids):
-    cos = cos[position_ids].unsqueeze(1)
-    sin = sin[position_ids].unsqueeze(1)
     q_embed = (q * cos) + (rotate_half(q) * sin)
     k_embed = (k * cos) + (rotate_half(k) * sin)
     return q_embed, k_embed
@@ -221,6 +249,7 @@ class InternLM2MLP(nn.Module):
         return down_proj
 def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
     """
     This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
@@ -233,6 +262,7 @@ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
     return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
 class InternLM2Attention(nn.Module):
     """Multi-headed attention from 'Attention Is All You Need' paper"""
@@ -277,14 +307,14 @@ class InternLM2Attention(nn.Module):
                     self.head_dim,
                     max_position_embeddings=self.max_position_embeddings,
                     base=self.config.rope_theta,
-                    scaling_factor=scaling_factor
                 )
             elif scaling_type == "linear":
                 self.rotary_emb = InternLM2LinearScalingRotaryEmbedding(
                     self.head_dim,
                     max_position_embeddings=self.max_position_embeddings,
                     base=self.config.rope_theta,
-                    scaling_factor=scaling_factor
                 )
             else:
                 raise ValueError("Currently we only support rotary embedding's type being 'dynamic' or 'linear'.")
@@ -381,6 +411,7 @@ class InternLM2Attention(nn.Module):
         return attn_output, attn_weights, past_key_value
 class InternLM2FlashAttention2(InternLM2Attention):
     """
     InternLM2 flash attention module. This module inherits from `InternLM2Attention` as the weights of the module stays
@@ -417,9 +448,8 @@ class InternLM2FlashAttention2(InternLM2Attention):
         qkv_states = rearrange(
             qkv_states,
             "b q (h gs d) -> b q h gs d",
-            gs=self.num_heads + 2 * self.num_key_value_heads,
             d=self.head_dim,
-            q=q_len,
         )
         query_states = qkv_states[..., : self.num_key_value_groups, :]
@@ -427,6 +457,10 @@ class InternLM2FlashAttention2(InternLM2Attention):
         key_states = qkv_states[..., -2, :]
         value_states = qkv_states[..., -1, :]
         kv_seq_len = key_states.shape[-2]
         if past_key_value is not None:
             kv_seq_len += past_key_value[0].shape[-2]
@@ -446,36 +480,9 @@ class InternLM2FlashAttention2(InternLM2Attention):
         key_states = key_states.transpose(1, 2)
         value_states = value_states.transpose(1, 2)
-        dropout_rate = 0.0 if not self.training else self.attention_dropout
-        # In PEFT, usually we cast the layer norms in float32 for training stability reasons
-        # therefore the input hidden states gets silently casted in float32. Hence, we need
-        # cast them back in the correct dtype just to be sure everything works as expected.
-        # This might slowdown training & inference so it is recommended to not cast the LayerNorms
-        # in fp32. (InternLM2RMSNorm handles it correctly)
-        input_dtype = query_states.dtype
-        if input_dtype == torch.float32:
-            # Handle the case where the model is quantized
-            if hasattr(self.config, "_pre_quantization_dtype"):
-                target_dtype = self.config._pre_quantization_dtype
-            else:
-                target_dtype = self.q_proj.weight.dtype
-            logger.warning_once(
-                f"The input hidden states seems to be silently casted in float32, this might be related to"
-                f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back "
-                f"the input in {target_dtype}."
-            )
-            query_states = query_states.to(target_dtype)
-            key_states = key_states.to(target_dtype)
-            value_states = value_states.to(target_dtype)
         attn_output = self._flash_attention_forward(
-            query_states, key_states, value_states, attention_mask, q_len, dropout=dropout_rate
         )
         attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous()
         attn_output = self.wo(attn_output)
@@ -484,16 +491,112 @@ class InternLM2FlashAttention2(InternLM2Attention):
         return attn_output, attn_weights, past_key_value
 class InternLM2DecoderLayer(nn.Module):
     def __init__(self, config: InternLM2Config):
         super().__init__()
         self.hidden_size = config.hidden_size
-        self.attention = (
-            InternLM2Attention(config=config)
-            if not getattr(config, "_flash_attn_2_enabled", False)
-            else InternLM2FlashAttention2(config=config)
-        )
         self.feed_forward = InternLM2MLP(config)
         self.attention_norm = InternLM2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
         self.ffn_norm = InternLM2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
@@ -578,6 +681,7 @@ InternLM2_START_DOCSTRING = r"""
 """
 @add_start_docstrings(
     "The bare InternLM2 Model outputting raw hidden-states without any specific head on top.",
     InternLM2_START_DOCSTRING,
@@ -588,7 +692,6 @@ class InternLM2PreTrainedModel(PreTrainedModel):
     supports_gradient_checkpointing = True
     _no_split_modules = ["InternLM2DecoderLayer"]
     _skip_keys_device_placement = "past_key_values"
-    _supports_flash_attn_2 = True
     def _init_weights(self, module):
         std = self.config.initializer_range
@@ -667,6 +770,7 @@ InternLM2_INPUTS_DOCSTRING = r"""
 """
 @add_start_docstrings(
     "The bare InternLM2 Model outputting raw hidden-states without any specific head on top.",
     InternLM2_START_DOCSTRING,
@@ -685,8 +789,10 @@ class InternLM2Model(InternLM2PreTrainedModel):
         super().__init__(config)
         self.padding_idx = config.pad_token_id
         self.vocab_size = config.vocab_size
         self.tok_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
         self.layers = nn.ModuleList([InternLM2DecoderLayer(config) for _ in range(config.num_hidden_layers)])
         self.norm = InternLM2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
@@ -700,7 +806,6 @@ class InternLM2Model(InternLM2PreTrainedModel):
     def set_input_embeddings(self, value):
         self.tok_embeddings = value
-    # Copied from transformers.models.bart.modeling_bart.BartDecoder._prepare_decoder_attention_mask
     def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length):
         # create causal mask
         # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
@@ -745,6 +850,9 @@ class InternLM2Model(InternLM2PreTrainedModel):
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
         # retrieve input_ids and inputs_embeds
         if input_ids is not None and inputs_embeds is not None:
             raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
@@ -770,14 +878,18 @@ class InternLM2Model(InternLM2PreTrainedModel):
         if inputs_embeds is None:
             inputs_embeds = self.tok_embeddings(input_ids)
-        # embed positions
-        if attention_mask is None:
-            attention_mask = torch.ones(
-                (batch_size, seq_length_with_past), dtype=torch.bool, device=inputs_embeds.device
             )
-        attention_mask = self._prepare_decoder_attention_mask(
-            attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length
-        )
         # embed positions
         hidden_states = inputs_embeds
@@ -851,6 +963,7 @@ class InternLM2Model(InternLM2PreTrainedModel):
         )
 class InternLM2ForCausalLM(InternLM2PreTrainedModel):
     _auto_class = "AutoModelForCausalLM"
@@ -1021,14 +1134,15 @@ class InternLM2ForCausalLM(InternLM2PreTrainedModel):
         return reordered_past
     def build_inputs(self, tokenizer, query: str, history: List[Tuple[str, str]] = [], meta_instruction=""):
-        prompt = ""
-        if meta_instruction:
-            prompt += f"""<s>[UNUSED_TOKEN_146]system\n{meta_instruction}[UNUSED_TOKEN_145]\n"""
         else:
-            prompt += "<s>"
         for record in history:
-            prompt += f"""[UNUSED_TOKEN_146]user\n{record[0]}[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n{record[1]}[UNUSED_TOKEN_145]\n"""
-        prompt += f"""[UNUSED_TOKEN_146]user\n{query}[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n"""
         return tokenizer([prompt], return_tensors="pt")
     @torch.no_grad()
@@ -1043,14 +1157,14 @@ class InternLM2ForCausalLM(InternLM2PreTrainedModel):
         temperature: float = 0.8,
         top_p: float = 0.8,
         meta_instruction: str = "You are an AI assistant whose name is InternLM (书生·浦语).\n"
-"- InternLM (书生·浦语) is a conversational language model that is developed by Shanghai AI Laboratory (上海人工智能实验室). It is designed to be helpful, honest, and harmless.\n"
-"- InternLM (书生·浦语) can understand and communicate fluently in the language chosen by the user such as English and 中文.",
         **kwargs,
     ):
         inputs = self.build_inputs(tokenizer, query, history, meta_instruction)
         inputs = {k: v.to(self.device) for k, v in inputs.items() if torch.is_tensor(v)}
         # also add end-of-assistant token in eos token id to avoid unnecessary generation
-        eos_token_id = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids(["[UNUSED_TOKEN_145]"])[0]]
         outputs = self.generate(
             **inputs,
             streamer=streamer,
@@ -1063,7 +1177,7 @@ class InternLM2ForCausalLM(InternLM2PreTrainedModel):
         )
         outputs = outputs[0].cpu().tolist()[len(inputs["input_ids"][0]) :]
         response = tokenizer.decode(outputs, skip_special_tokens=True)
-        response = response.split("[UNUSED_TOKEN_145]")[0]
         history = history + [(query, response)]
         return response, history
@@ -1101,6 +1215,7 @@ class InternLM2ForCausalLM(InternLM2PreTrainedModel):
                 self.query = query
                 self.history = history
                 self.response = ""
                 self.received_inputs = False
                 self.queue.put((self.response, history + [(self.query, self.response)]))
@@ -1115,11 +1230,15 @@ class InternLM2ForCausalLM(InternLM2PreTrainedModel):
                     self.received_inputs = True
                     return
-                token = self.tokenizer.decode([value[-1]], skip_special_tokens=True)
-                if token.strip() != "[UNUSED_TOKEN_145]":
                     self.response = self.response + token
                     history = self.history + [(self.query, self.response)]
                     self.queue.put((self.response, history))
             def end(self):
                 self.queue.put(None)
@@ -1149,6 +1268,7 @@ class InternLM2ForCausalLM(InternLM2PreTrainedModel):
         return consumer()
 @add_start_docstrings(
     """
     The InternLM2 Model transformer with a sequence classification head on top (linear layer).

+# Copyright (c) The InternLM team and The HuggingFace Inc. team. All rights reserved.
 #
+# This code is based on transformers/src/transformers/models/llama/modeling_llama.py
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 from typing import List, Optional, Tuple, Union
 import torch
+import torch.nn.functional as F
 import torch.utils.checkpoint
 from einops import rearrange
 from torch import nn
 except:  # noqa # pylint: disable=bare-except
     BaseStreamer = None
+from .configuration_internlm2 import InternLM2Config
 logger = logging.get_logger(__name__)
 _CONFIG_FOR_DOC = "InternLM2Config"
+flash_attn_func, flash_attn_varlen_func = None, None
+pad_input, index_first_axis, unpad_input = None, None, None
+def _import_flash_attn():
+    global flash_attn_func, flash_attn_varlen_func
+    global pad_input, index_first_axis, unpad_input
+    try:
+        from flash_attn import flash_attn_func as _flash_attn_func, flash_attn_varlen_func as _flash_attn_varlen_func
+        from flash_attn.bert_padding import pad_input as _pad_input, index_first_axis as _index_first_axis, unpad_input as _unpad_input
+        flash_attn_func, flash_attn_varlen_func = _flash_attn_func, _flash_attn_varlen_func
+        pad_input, index_first_axis, unpad_input = _pad_input, _index_first_axis, _unpad_input
+    except ImportError:
+        raise ImportError("flash_attn is not installed.")
+# Copied from transformers.models.llama.modeling_llama._get_unpad_data
+def _get_unpad_data(attention_mask):
+    seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
+    indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
+    max_seqlen_in_batch = seqlens_in_batch.max().item()
+    cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.torch.int32), (1, 0))
+    return (
+        indices,
+        cu_seqlens,
+        max_seqlen_in_batch,
+    )
 # Copied from transformers.models.bart.modeling_bart._make_causal_mask
 def _make_causal_mask(
     return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
+# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->InternLM2
 class InternLM2RMSNorm(nn.Module):
     def __init__(self, hidden_size, eps=1e-6):
         """
         return self.weight * hidden_states.to(input_dtype)
+# Copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->InternLM2
 class InternLM2RotaryEmbedding(nn.Module):
     def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
         super().__init__()
         )
+# Copied from transformers.models.llama.modeling_llama.LlamaLinearScalingRotaryEmbedding with Llama->InternLM2
 class InternLM2LinearScalingRotaryEmbedding(InternLM2RotaryEmbedding):
     """InternLM2RotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""
         self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
+# Copied from transformers.models.llama.modeling_llama.LlamaDynamicNTKScalingRotaryEmbedding with Llama->InternLM2
 class InternLM2DynamicNTKScalingRotaryEmbedding(InternLM2RotaryEmbedding):
     """InternLM2RotaryEmbedding extended with Dynamic NTK scaling.
     Credits to the Reddit users /u/bloc97 and /u/emozilla.
         self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
+# Copied from transformers.models.llama.modeling_llama.rotate_half
 def rotate_half(x):
     """Rotates half the hidden dims of the input."""
     x1 = x[..., : x.shape[-1] // 2]
     return torch.cat((-x2, x1), dim=-1)
+# Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
+    """Applies Rotary Position Embedding to the query and key tensors."""
+    cos = cos[position_ids].unsqueeze(unsqueeze_dim)
+    sin = sin[position_ids].unsqueeze(unsqueeze_dim)
     q_embed = (q * cos) + (rotate_half(q) * sin)
     k_embed = (k * cos) + (rotate_half(k) * sin)
     return q_embed, k_embed
         return down_proj
+# Copied from transformers.models.llama.modeling_llama.repeat_kv
 def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
     """
     This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
     return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
+# Modified from transformers.models.llama.modeling_llama.LlamaAttention
 class InternLM2Attention(nn.Module):
     """Multi-headed attention from 'Attention Is All You Need' paper"""
                     self.head_dim,
                     max_position_embeddings=self.max_position_embeddings,
                     base=self.config.rope_theta,
+                    scaling_factor=scaling_factor,
                 )
             elif scaling_type == "linear":
                 self.rotary_emb = InternLM2LinearScalingRotaryEmbedding(
                     self.head_dim,
                     max_position_embeddings=self.max_position_embeddings,
                     base=self.config.rope_theta,
+                    scaling_factor=scaling_factor,
                 )
             else:
                 raise ValueError("Currently we only support rotary embedding's type being 'dynamic' or 'linear'.")
         return attn_output, attn_weights, past_key_value
+# Modified from transformers.models.llama.modeling_llama.LlamaFlashAttention2
 class InternLM2FlashAttention2(InternLM2Attention):
     """
     InternLM2 flash attention module. This module inherits from `InternLM2Attention` as the weights of the module stays
         qkv_states = rearrange(
             qkv_states,
             "b q (h gs d) -> b q h gs d",
+            gs=2 + self.num_key_value_groups,
             d=self.head_dim,
         )
         query_states = qkv_states[..., : self.num_key_value_groups, :]
         key_states = qkv_states[..., -2, :]
         value_states = qkv_states[..., -1, :]
+        query_states = query_states.transpose(1, 2)
+        key_states = key_states.transpose(1, 2)
+        value_states = value_states.transpose(1, 2)
         kv_seq_len = key_states.shape[-2]
         if past_key_value is not None:
             kv_seq_len += past_key_value[0].shape[-2]
         key_states = key_states.transpose(1, 2)
         value_states = value_states.transpose(1, 2)
         attn_output = self._flash_attention_forward(
+            query_states, key_states, value_states, attention_mask, q_len
         )
         attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous()
         attn_output = self.wo(attn_output)
         return attn_output, attn_weights, past_key_value
+    def _flash_attention_forward(
+        self, query_states, key_states, value_states, attention_mask, query_length, dropout=0.0, softmax_scale=None
+    ):
+        """
+        Calls the forward method of Flash Attention. If the input hidden states contain at least one padding token,
+        the input is first unpadded, attention is then computed, and the attention output is padded back.
+        Args:
+            query_states (`torch.Tensor`):
+                Input query states to be passed to Flash Attention API
+            key_states (`torch.Tensor`):
+                Input key states to be passed to Flash Attention API
+            value_states (`torch.Tensor`):
+                Input value states to be passed to Flash Attention API
+            attention_mask (`torch.Tensor`):
+                The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
+                position of padding tokens and 1 for the position of non-padding tokens.
+            dropout (`float`, *optional*):
+                Attention dropout
+            softmax_scale (`float`, *optional*):
+                The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)
+        """
+        # Use causal masking unless a single query token is processed; a non-None attention_mask below means the batch may contain at least one padding token.
+        causal = self.is_causal and query_length != 1
+        if attention_mask is not None:
+            batch_size = query_states.shape[0]
+            query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._unpad_input(
+                query_states, key_states, value_states, attention_mask, query_length
+            )
+            cu_seqlens_q, cu_seqlens_k = cu_seq_lens
+            max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens
+            attn_output_unpad = flash_attn_varlen_func(
+                query_states,
+                key_states,
+                value_states,
+                cu_seqlens_q=cu_seqlens_q,
+                cu_seqlens_k=cu_seqlens_k,
+                max_seqlen_q=max_seqlen_in_batch_q,
+                max_seqlen_k=max_seqlen_in_batch_k,
+                dropout_p=dropout,
+                softmax_scale=softmax_scale,
+                causal=causal,
+            )
+            attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)
+        else:
+            attn_output = flash_attn_func(
+                query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal
+            )
+        return attn_output
+    def _unpad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length):
+        """
+        Remove padded positions from the query/key/value tensors for the variable-length attention call.
+        Keys and values are flattened over their first two dimensions and gathered with the indices
+        returned by `_get_unpad_data(attention_mask)`. The query is handled according to `query_length`:
+        gathered with the same indices when it equals the key length, squeezed on dim 1 when it is 1,
+        and otherwise unpadded against the last `query_length` columns of `attention_mask`.
+        NOTE(review): presumably `attention_mask` is the 2D padding mask of shape (batch, kv_seq_len)
+        - confirm against `_get_unpad_data`.
+        Returns:
+            A 6-tuple `(query_layer, key_layer, value_layer, indices_q, (cu_seqlens_q, cu_seqlens_k),
+            (max_seqlen_in_batch_q, max_seqlen_in_batch_k))`, with `indices_q` cast to `torch.int64`.
+        """
+        indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
+        batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape
+        # Flatten (batch, seq) into one axis, then keep only the rows selected by indices_k.
+        key_layer = index_first_axis(
+            key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
+        )
+        value_layer = index_first_axis(
+            value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
+        )
+        if query_length == kv_seq_len:
+            # Query and key cover the same positions, so the key-side metadata is reused for the query.
+            query_layer = index_first_axis(
+                query_layer.reshape(batch_size * kv_seq_len, self.num_heads, head_dim), indices_k
+            )
+            cu_seqlens_q = cu_seqlens_k
+            max_seqlen_in_batch_q = max_seqlen_in_batch_k
+            indices_q = indices_k
+        elif query_length == 1:
+            # One query token per batch entry: the cumulative lengths are simply 0..batch_size.
+            max_seqlen_in_batch_q = 1
+            cu_seqlens_q = torch.arange(
+                batch_size + 1, dtype=torch.int32, device=query_layer.device
+            )  # There is a memcpy here, that is very bad.
+            indices_q = cu_seqlens_q[:-1]
+            query_layer = query_layer.squeeze(1)
+        else:
+            # The -q_len: slice assumes left padding.
+            attention_mask = attention_mask[:, -query_length:]
+            query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask)
+        return (
+            query_layer,
+            key_layer,
+            value_layer,
+            indices_q.to(torch.int64),
+            (cu_seqlens_q, cu_seqlens_k),
+            (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
+        )
+# Maps the `config.attn_implementation` string to the attention module class to instantiate.
+INTERNLM2_ATTENTION_CLASSES = {
+    "eager": InternLM2Attention,
+    "flash_attention_2": InternLM2FlashAttention2,
+}
+# Modified from transformers.models.llama.modeling_llama.LlamaDecoderLayer
 class InternLM2DecoderLayer(nn.Module):
     def __init__(self, config: InternLM2Config):
         super().__init__()
         self.hidden_size = config.hidden_size
+        self.attention = INTERNLM2_ATTENTION_CLASSES[config.attn_implementation](config=config)
         self.feed_forward = InternLM2MLP(config)
         self.attention_norm = InternLM2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
         self.ffn_norm = InternLM2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
 """
+# Copied from transformers.models.llama.modeling_llama.LlamaPreTrainedModel with Llama->InternLM2
 @add_start_docstrings(
     "The bare InternLM2 Model outputting raw hidden-states without any specific head on top.",
     InternLM2_START_DOCSTRING,
     supports_gradient_checkpointing = True
     _no_split_modules = ["InternLM2DecoderLayer"]
     _skip_keys_device_placement = "past_key_values"
     def _init_weights(self, module):
         std = self.config.initializer_range
 """
+# Modified from transformers.models.llama.modeling_llama.LlamaModel
 @add_start_docstrings(
     "The bare InternLM2 Model outputting raw hidden-states without any specific head on top.",
     InternLM2_START_DOCSTRING,
         super().__init__(config)
         self.padding_idx = config.pad_token_id
         self.vocab_size = config.vocab_size
+        self.config = config
         self.tok_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
         self.layers = nn.ModuleList([InternLM2DecoderLayer(config) for _ in range(config.num_hidden_layers)])
         self.norm = InternLM2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
     def set_input_embeddings(self, value):
         self.tok_embeddings = value
     def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length):
         # create causal mask
         # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        if self.config.attn_implementation == "flash_attention_2":
+            _import_flash_attn()
         # retrieve input_ids and inputs_embeds
         if input_ids is not None and inputs_embeds is not None:
             raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
         if inputs_embeds is None:
             inputs_embeds = self.tok_embeddings(input_ids)
+        if self.config.attn_implementation == "flash_attention_2":
+            # 2d mask is passed through the layers
+            attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
+        else:
+            if attention_mask is None:
+                attention_mask = torch.ones(
+                    (batch_size, seq_length_with_past), dtype=torch.bool, device=inputs_embeds.device
+                )
+            attention_mask = self._prepare_decoder_attention_mask(
+                attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length
             )
         # embed positions
         hidden_states = inputs_embeds
         )
+# Modified from transformers.models.llama.modeling_llama.LlamaForCausalLM
 class InternLM2ForCausalLM(InternLM2PreTrainedModel):
     _auto_class = "AutoModelForCausalLM"
         return reordered_past
     def build_inputs(self, tokenizer, query: str, history: List[Tuple[str, str]] = [], meta_instruction=""):
+        if tokenizer.add_bos_token:
+            prompt = ""
         else:
+            prompt = tokenizer.bos_token
+        if meta_instruction:
+            prompt += f"""<|im_start|>system\n{meta_instruction}<|im_end|>\n"""
         for record in history:
+            prompt += f"""<|im_start|>user\n{record[0]}<|im_end|>\n<|im_start|>assistant\n{record[1]}<|im_end|>\n"""
+        prompt += f"""<|im_start|>user\n{query}<|im_end|>\n<|im_start|>assistant\n"""
         return tokenizer([prompt], return_tensors="pt")
     @torch.no_grad()
         temperature: float = 0.8,
         top_p: float = 0.8,
         meta_instruction: str = "You are an AI assistant whose name is InternLM (书生·浦语).\n"
+        "- InternLM (书生·浦语) is a conversational language model that is developed by Shanghai AI Laboratory (上海人工智能实验室). It is designed to be helpful, honest, and harmless.\n"
+        "- InternLM (书生·浦语) can understand and communicate fluently in the language chosen by the user such as English and 中文.",
         **kwargs,
     ):
         inputs = self.build_inputs(tokenizer, query, history, meta_instruction)
         inputs = {k: v.to(self.device) for k, v in inputs.items() if torch.is_tensor(v)}
         # also add end-of-assistant token in eos token id to avoid unnecessary generation
+        eos_token_id = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids(["<|im_end|>"])[0]]
         outputs = self.generate(
             **inputs,
             streamer=streamer,
         )
         outputs = outputs[0].cpu().tolist()[len(inputs["input_ids"][0]) :]
         response = tokenizer.decode(outputs, skip_special_tokens=True)
+        response = response.split("<|im_end|>")[0]
         history = history + [(query, response)]
         return response, history
                 self.query = query
                 self.history = history
                 self.response = ""
+                self.cache = []
                 self.received_inputs = False
                 self.queue.put((self.response, history + [(self.query, self.response)]))
                     self.received_inputs = True
                     return
+                self.cache.extend(value.tolist())
+                token = self.tokenizer.decode(self.cache, skip_special_tokens=True)
+                if token.strip() != "<|im_end|>":
                     self.response = self.response + token
                     history = self.history + [(self.query, self.response)]
                     self.queue.put((self.response, history))
+                    self.cache = []
+                else:
+                    self.end()
             def end(self):
                 self.queue.put(None)
         return consumer()
+# Copied from transformers.models.llama.modeling_llama.LlamaForSequenceClassification with Llama->InternLM2
 @add_start_docstrings(
     """
     The InternLM2 Model transformer with a sequence classification head on top (linear layer).

tokenization_internlm.py → tokenization_internlm2.py RENAMED Viewed

@@ -1,10 +1,7 @@
 # coding=utf-8
-# Copyright (c) InternLM. All rights reserved.
 #
-# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
-# and OPT implementations in this library. It has been modified from its
-# original forms to accommodate minor architectural differences compared
-# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -18,7 +15,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""Tokenization classes for IntermLM."""
 import os
 from shutil import copyfile
 from typing import Any, Dict, List, Optional, Tuple
@@ -34,9 +31,10 @@ VOCAB_FILES_NAMES = {"vocab_file": "./tokenizer.model"}
 PRETRAINED_VOCAB_FILES_MAP = {}
-class InternLMTokenizer(PreTrainedTokenizer):
     """
-    Construct a InternLM tokenizer. Based on byte-level Byte-Pair-Encoding.
     Args:
         vocab_file (`str`):
@@ -79,8 +77,6 @@ class InternLMTokenizer(PreTrainedTokenizer):
             **kwargs,
         )
-        """ Initialization"""
     @property
     def no_prefix_space_tokens(self):
         if self._no_prefix_space_tokens is None:

 # coding=utf-8
+# Copyright (c) The InternLM team and The HuggingFace Inc. team. All rights reserved.
 #
+# This code is based on transformers/src/transformers/models/llama/tokenization_llama.py
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+"""Tokenization classes for InternLM."""
 import os
 from shutil import copyfile
 from typing import Any, Dict, List, Optional, Tuple
 PRETRAINED_VOCAB_FILES_MAP = {}
+# Modified from transformers.models.llama.tokenization_llama.LlamaTokenizer
+class InternLM2Tokenizer(PreTrainedTokenizer):
     """
+    Construct a InternLM2 tokenizer. Based on byte-level Byte-Pair-Encoding.
     Args:
         vocab_file (`str`):
             **kwargs,
         )
     @property
     def no_prefix_space_tokens(self):
         if self._no_prefix_space_tokens is None:

tokenization_internlm2_fast.py ADDED Viewed

	@@ -0,0 +1,214 @@

+# coding=utf-8
+# Copyright (c) The InternLM team and The HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on transformers/src/transformers/models/llama/tokenization_llama_fast.py
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization Fast class for InternLM."""
+import os
+from shutil import copyfile
+from typing import Any, Dict, Optional, Tuple
+from tokenizers import processors, decoders, Tokenizer, normalizers
+from tokenizers.models import BPE
+from transformers.tokenization_utils_fast import PreTrainedTokenizerFast
+from transformers.utils import logging
+from transformers.convert_slow_tokenizer import (
+    SLOW_TO_FAST_CONVERTERS,
+    SpmConverter,
+    SentencePieceExtractor,
+)
+from .tokenization_internlm2 import InternLM2Tokenizer
+# Module-level logger obtained through the transformers logging utilities.
+logger = logging.get_logger(__name__)
+# File name of the tokenizer's vocabulary file; `save_vocabulary` uses it to name the saved copy.
+VOCAB_FILES_NAMES = {"vocab_file": "./tokenizer.model"}
+# Modified from transformers.convert_slow_tokenizer.LlamaConverter
+class InternLM2Converter(SpmConverter):
+    """
+    Converter that turns the slow `InternLM2Tokenizer` into a `tokenizers` BPE tokenizer.
+    Only a proto whose `trainer_spec.model_type` is 2 is accepted; it is built as a BPE model with
+    byte fallback. Any other model type raises.
+    """
+    handle_byte_fallback = True
+    def vocab(self, proto):
+        """Return `(piece, score)` pairs; the first three entries are fixed to `<unk>`, `<s>`, `</s>`."""
+        vocab = [
+            ("<unk>", 0.0),
+            ("<s>", 0.0),
+            ("</s>", 0.0),
+        ]
+        vocab += [(piece.piece, piece.score) for piece in proto.pieces[3:]]
+        return vocab
+    def unk_id(self, proto):
+        """Return the id of the unknown token, which is index 0 of the list built by `vocab`."""
+        return 0
+    def decoder(self, replacement, add_prefix_space):
+        """Return the decoder pipeline: "▁" -> " ", byte fallback, fuse, then strip a leading space."""
+        return decoders.Sequence(
+            [
+                decoders.Replace("▁", " "),
+                decoders.ByteFallback(),
+                decoders.Fuse(),
+                decoders.Strip(content=" ", left=1),
+            ]
+        )
+    def tokenizer(self, proto):
+        """
+        Build the BPE `Tokenizer` from the SentencePiece proto.
+        Vocabulary entries whose index appears in the slow tokenizer's `added_tokens_decoder` are
+        renamed to the added token's content, and every added token is registered as a special token.
+        Raises:
+            RuntimeError: if `proto.trainer_spec.model_type` is 1.
+            Exception: if `proto.trainer_spec.model_type` is neither 1 nor 2.
+        """
+        model_type = proto.trainer_spec.model_type
+        vocab_scores = self.vocab(proto)
+        # special tokens
+        added_tokens = self.original_tokenizer.added_tokens_decoder
+        # Replacing an element by index does not change the list length, so this is safe while enumerating.
+        for i, (_piece, score) in enumerate(vocab_scores):
+            if i in added_tokens:
+                vocab_scores[i] = (added_tokens[i].content, score)
+        if model_type == 1:
+            raise RuntimeError("InternLM2 is supposed to be a BPE model!")
+        elif model_type == 2:
+            _, merges = SentencePieceExtractor(self.original_tokenizer.vocab_file).extract(vocab_scores)
+            bpe_vocab = {word: i for i, (word, _score) in enumerate(vocab_scores)}
+            tokenizer = Tokenizer(
+                BPE(bpe_vocab, merges, unk_token=proto.trainer_spec.unk_piece, fuse_unk=True, byte_fallback=True)
+            )
+            tokenizer.add_special_tokens(list(added_tokens.values()))
+        else:
+            # This converter only builds BPE models, so the message names BPE rather than Unigram.
+            raise Exception(
+                "InternLM2 is supposed to be a BPE model, but your file was trained with a different algorithm"
+            )
+        return tokenizer
+    def normalizer(self, proto):
+        """Return the normalizer: optionally prepend "▁" (per the proto), then replace spaces with "▁"."""
+        normalizers_list = []
+        if proto.normalizer_spec.add_dummy_prefix:
+            normalizers_list.append(normalizers.Prepend(prepend="▁"))
+        normalizers_list.append(normalizers.Replace(pattern=" ", content="▁"))
+        return normalizers.Sequence(normalizers_list)
+    def pre_tokenizer(self, replacement, add_prefix_space):
+        """Return `None`: this converter configures no pre-tokenizer."""
+        return None
+# Register the converter in transformers' slow-to-fast converter table under the slow tokenizer's class name.
+SLOW_TO_FAST_CONVERTERS["InternLM2Tokenizer"] = InternLM2Converter
+# Modified from transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast -> InternLM2TokenizerFast
+class InternLM2TokenizerFast(PreTrainedTokenizerFast):
+    """
+    Fast InternLM2 tokenizer built on `PreTrainedTokenizerFast`.
+    Args:
+        vocab_file (`str`):
+            Path to the vocabulary file; also the file copied by `save_vocabulary`.
+        unk_token, bos_token, eos_token, pad_token (`str`):
+            Special tokens, forwarded to the base class.
+        sp_model_kwargs (`Dict[str, Any]`, *optional*):
+            Forwarded unchanged to the base class.
+        add_bos_token (`bool`, defaults to `True`):
+            Whether the post-processor prepends `bos_token` to each sequence.
+        add_eos_token (`bool`, defaults to `False`):
+            Whether the post-processor appends `eos_token` to each sequence.
+        decode_with_prefix_space, clean_up_tokenization_spaces (`bool`):
+            Forwarded unchanged to the base class.
+    """
+    vocab_files_names = VOCAB_FILES_NAMES
+    slow_tokenizer_class = InternLM2Tokenizer
+    padding_side = "left"
+    model_input_names = ["input_ids", "attention_mask"]
+    _auto_class = "AutoTokenizer"
+    def __init__(
+        self,
+        vocab_file,
+        unk_token="<unk>",
+        bos_token="<s>",
+        eos_token="</s>",
+        pad_token="</s>",
+        sp_model_kwargs: Optional[Dict[str, Any]] = None,
+        add_bos_token=True,
+        add_eos_token=False,
+        decode_with_prefix_space=False,
+        clean_up_tokenization_spaces=False,
+        **kwargs,
+    ):
+        super().__init__(
+            vocab_file=vocab_file,
+            unk_token=unk_token,
+            bos_token=bos_token,
+            eos_token=eos_token,
+            pad_token=pad_token,
+            sp_model_kwargs=sp_model_kwargs,
+            add_bos_token=add_bos_token,
+            add_eos_token=add_eos_token,
+            decode_with_prefix_space=decode_with_prefix_space,
+            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+            **kwargs,
+        )
+        # Set the private flags directly: the property setters would each rebuild the post-processor.
+        self._add_bos_token = add_bos_token
+        self._add_eos_token = add_eos_token
+        self.update_post_processor()
+        self.vocab_file = vocab_file
+    @property
+    def can_save_slow_tokenizer(self) -> bool:
+        """`True` only when `vocab_file` is set and points to an existing file."""
+        return os.path.isfile(self.vocab_file) if self.vocab_file else False
+    def update_post_processor(self):
+        """
+        Updates the underlying post processor with the current `bos_token` and `eos_token`.
+        Raises:
+            ValueError: if `add_bos_token` (or `add_eos_token`) is enabled while the matching token is `None`.
+        """
+        bos = self.bos_token
+        bos_token_id = self.bos_token_id
+        if bos is None and self.add_bos_token:
+            raise ValueError("add_bos_token = True but bos_token = None")
+        eos = self.eos_token
+        eos_token_id = self.eos_token_id
+        if eos is None and self.add_eos_token:
+            raise ValueError("add_eos_token = True but eos_token = None")
+        # Template strings for one sequence ($A) and for a pair ($A, $B); ":0" / ":1" are the type ids.
+        single = f"{(bos+':0 ') if self.add_bos_token else ''}$A:0{(' '+eos+':0') if self.add_eos_token else ''}"
+        pair = f"{single}{(' '+bos+':1') if self.add_bos_token else ''} $B:1{(' '+eos+':1') if self.add_eos_token else ''}"
+        special_tokens = []
+        if self.add_bos_token:
+            special_tokens.append((bos, bos_token_id))
+        if self.add_eos_token:
+            special_tokens.append((eos, eos_token_id))
+        self._tokenizer.post_processor = processors.TemplateProcessing(
+            single=single, pair=pair, special_tokens=special_tokens
+        )
+    @property
+    def add_eos_token(self):
+        """Whether `eos_token` is appended by the post-processor."""
+        return self._add_eos_token
+    @property
+    def add_bos_token(self):
+        """Whether `bos_token` is prepended by the post-processor."""
+        return self._add_bos_token
+    @add_eos_token.setter
+    def add_eos_token(self, value):
+        self._add_eos_token = value
+        self.update_post_processor()
+    @add_bos_token.setter
+    def add_bos_token(self, value):
+        self._add_bos_token = value
+        self.update_post_processor()
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Optional[Tuple[str]]:
+        """
+        Copy the vocabulary file into `save_directory`.
+        Args:
+            save_directory (`str`): Existing directory to copy the vocabulary file into.
+            filename_prefix (`str`, *optional*): If given, the file is saved as `<prefix>-<file name>`.
+        Returns:
+            A 1-tuple with the output path, or `None` (after logging an error) when `save_directory`
+            is not a directory.
+        Raises:
+            ValueError: if `can_save_slow_tokenizer` is `False`.
+        """
+        if not self.can_save_slow_tokenizer:
+            raise ValueError(
+                "Your fast tokenizer does not have the necessary information to save the vocabulary for a slow "
+                "tokenizer."
+            )
+        if not os.path.isdir(save_directory):
+            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
+            return
+        # The configured name carries a "./" prefix; take the basename so that a `filename_prefix`
+        # yields "<prefix>-tokenizer.model" instead of the non-existent sub-path "<prefix>-./tokenizer.model".
+        vocab_file_name = os.path.basename(VOCAB_FILES_NAMES["vocab_file"])
+        out_vocab_file = os.path.join(
+            save_directory, (filename_prefix + "-" if filename_prefix else "") + vocab_file_name
+        )
+        # Skip the copy when source and destination are the same file.
+        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
+            copyfile(self.vocab_file, out_vocab_file)
+        return (out_vocab_file,)

tokenizer_config.json CHANGED Viewed

@@ -1,4 +1,17 @@
 {
   "added_tokens_decoder": {
     "0": {
       "content": "<unk>",
@@ -23,19 +36,55 @@
       "rstrip": false,
       "single_word": false,
       "special": true
     }
   },
-  "auto_map": {
-    "AutoTokenizer": [
-      "tokenization_internlm.InternLMTokenizer",
-      null
-    ]
-  },
-  "bos_token": "<s>",
-  "clean_up_tokenization_spaces": false,
-  "eos_token": "</s>",
-  "model_max_length": 1000000000000000019884624838656,
-  "pad_token": "</s>",
-  "tokenizer_class": "InternLMTokenizer",
-  "unk_token": "<unk>"
-}

 {
+  "auto_map": {
+    "AutoTokenizer": [
+      "tokenization_internlm2.InternLM2Tokenizer",
+      "tokenization_internlm2_fast.InternLM2TokenizerFast"
+    ]
+  },
+  "bos_token": "<s>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "</s>",
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "</s>",
+  "tokenizer_class": "InternLM2Tokenizer",
+  "unk_token": "<unk>",
   "added_tokens_decoder": {
     "0": {
       "content": "<unk>",
       "rstrip": false,
       "single_word": false,
       "special": true
+    },
+    "92543": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "92542": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "92541": {
+      "content": "<|action_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "92540": {
+      "content": "<|action_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "92539": {
+      "content": "<|interpreter|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "92538": {
+      "content": "<|plugin|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
     }
   },
+  "chat_template": "{{ bos_token }}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
+}