Update special tokens (#3)
- Update special tokens (4deffb0b0a29b37b72f6e9d1f58f209a033ef382)
- add chatml template (bdd4b6c0d4e531762b9b6a0fa6f50c459edd23fa)
- add bos in chat template (0da1df6f7a23940bc6bd400a3e0f6631b316526d)
- update chat template in model (b2357579bad20cdebc86db80aebaea11844084f1)

Files changed: modeling_internlm2.py (+6 -6), tokenizer_config.json (+77 -2)
modeling_internlm2.py CHANGED

@@ -1138,12 +1138,12 @@ class InternLM2ForCausalLM(InternLM2PreTrainedModel):
     def build_inputs(self, tokenizer, query: str, history: List[Tuple[str, str]] = [], meta_instruction=""):
         prompt = ""
         if meta_instruction:
-            prompt += f"""<s>[UNUSED_TOKEN_146]system\n{meta_instruction}[UNUSED_TOKEN_145]\n"""
+            prompt += f"""<s><|im_start|>system\n{meta_instruction}<|im_end|>\n"""
         else:
             prompt += "<s>"
         for record in history:
-            prompt += f"""[UNUSED_TOKEN_146]user\n{record[0]}[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n{record[1]}[UNUSED_TOKEN_145]\n"""
-        prompt += f"""[UNUSED_TOKEN_146]user\n{query}[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n"""
+            prompt += f"""<|im_start|>user\n{record[0]}<|im_end|>\n<|im_start|>assistant\n{record[1]}<|im_end|>\n"""
+        prompt += f"""<|im_start|>user\n{query}<|im_end|>\n<|im_start|>assistant\n"""
         return tokenizer([prompt], return_tensors="pt")

     @torch.no_grad()
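For orientation, a minimal standalone sketch of the ChatML-style prompt this hunk now assembles; the meta_instruction, history, and query values are illustrative, not taken from the repository:

# Sketch of the prompt string build_inputs produces after this change.
# All string values below are illustrative.
meta_instruction = "You are a helpful assistant."
history = [("Hi", "Hello! How can I help you?")]
query = "What can you do?"

prompt = f"<s><|im_start|>system\n{meta_instruction}<|im_end|>\n"
for user_turn, assistant_turn in history:
    prompt += f"<|im_start|>user\n{user_turn}<|im_end|>\n<|im_start|>assistant\n{assistant_turn}<|im_end|>\n"
prompt += f"<|im_start|>user\n{query}<|im_end|>\n<|im_start|>assistant\n"
print(prompt)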
@@ -1165,7 +1165,7 @@ class InternLM2ForCausalLM(InternLM2PreTrainedModel):
         inputs = self.build_inputs(tokenizer, query, history, meta_instruction)
         inputs = {k: v.to(self.device) for k, v in inputs.items() if torch.is_tensor(v)}
         # also add end-of-assistant token in eos token id to avoid unnecessary generation
-        eos_token_id = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids(["[UNUSED_TOKEN_145]"])[0]]
+        eos_token_id = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids(["<|im_end|>"])[0]]
         outputs = self.generate(
             **inputs,
             streamer=streamer,
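Because ChatML turns end with <|im_end|> rather than </s>, generation has to stop on either token. A minimal sketch of the same idea outside the class; the checkpoint name is an assumption for illustration:

# Stop generation on either the regular EOS or the new <|im_end|> token.
# The checkpoint name is an assumption, not taken from this diff.
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("internlm/internlm2-chat-7b", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained("internlm/internlm2-chat-7b", trust_remote_code=True)

inputs = tokenizer(["<s><|im_start|>user\nHi<|im_end|>\n<|im_start|>assistant\n"], return_tensors="pt")
im_end_id = tokenizer.convert_tokens_to_ids(["<|im_end|>"])[0]
# generate() accepts a list of eos token ids; hitting either one ends the sequence
outputs = model.generate(**inputs, eos_token_id=[tokenizer.eos_token_id, im_end_id], max_new_tokens=32)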
@@ -1178,7 +1178,7 @@ class InternLM2ForCausalLM(InternLM2PreTrainedModel):
         )
         outputs = outputs[0].cpu().tolist()[len(inputs["input_ids"][0]) :]
         response = tokenizer.decode(outputs, skip_special_tokens=True)
-        response = response.split("[UNUSED_TOKEN_145]")[0]
+        response = response.split("<|im_end|>")[0]
         history = history + [(query, response)]
         return response, history
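A toy illustration of the trim this hunk applies: everything from the first <|im_end|> onward is discarded (the decoded string here is made up):

# Toy example of the new stop-token trim; the decoded text is illustrative.
decoded = "I can answer questions.<|im_end|>extra tokens"
response = decoded.split("<|im_end|>")[0]
assert response == "I can answer questions."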
@@ -1231,7 +1231,7 @@ class InternLM2ForCausalLM(InternLM2PreTrainedModel):
                     return

             token = self.tokenizer.decode([value[-1]], skip_special_tokens=True)
-            if token.strip() != "[UNUSED_TOKEN_145]":
+            if token.strip() != "<|im_end|>":
                 self.response = self.response + token
                 history = self.history + [(self.query, self.response)]
                 self.queue.put((self.response, history))
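The public chat API is unchanged by this diff; only the prompt format and stop token differ. A usage sketch for both the blocking and streaming paths, again assuming the internlm/internlm2-chat-7b checkpoint name:

# Usage sketch; chat() and stream_chat() are defined in modeling_internlm2.py.
# The checkpoint name is an assumption for illustration.
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("internlm/internlm2-chat-7b", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained("internlm/internlm2-chat-7b", trust_remote_code=True).eval()

# blocking call: returns the full response plus updated history
response, history = model.chat(tokenizer, "Hi", history=[])
print(response)

# streaming call: the streamer callback patched above filters <|im_end|>
# out of the streamed text, yielding (partial_response, history) tuples
for partial_response, _ in model.stream_chat(tokenizer, "Hi"):
    print(partial_response)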
tokenizer_config.json CHANGED

@@ -11,5 +11,80 @@
   "model_max_length": 1000000000000000019884624838656,
   "pad_token": "</s>",
   "tokenizer_class": "InternLMTokenizer",
-  "unk_token": "<unk>"
-}
+  "unk_token": "<unk>",
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "92543": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "92542": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "92541": {
+      "content": "<|action_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "92540": {
+      "content": "<|action_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "92539": {
+      "content": "<|interpreter|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "92538": {
+      "content": "<|plugin|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "chat_template": "{{ bos_token }}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
+}
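With chat_template in place, the same format can be produced through Transformers' standard apply_chat_template helper instead of the model's custom build_inputs; note the template prepends bos_token (the "add bos in chat template" commit above). A sketch, with the checkpoint name again assumed:

# Render the new chat template; the checkpoint name is an assumption.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("internlm/internlm2-chat-7b", trust_remote_code=True)
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hi"},
]
text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(text)
# Expected shape: bos_token first, then one <|im_start|>{role}\n{content}<|im_end|>
# block per message, ending with an <|im_start|>assistant generation prompt.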