Update special tokens (#3)
- Update special tokens (4deffb0b0a29b37b72f6e9d1f58f209a033ef382)
- add chatml template (bdd4b6c0d4e531762b9b6a0fa6f50c459edd23fa)
- add bos in chat template (0da1df6f7a23940bc6bd400a3e0f6631b316526d)
- update chat template in model (b2357579bad20cdebc86db80aebaea11844084f1)

Files changed: modeling_internlm2.py (+6 -6), tokenizer_config.json (+77 -2)
modeling_internlm2.py CHANGED

@@ -1138,12 +1138,12 @@ class InternLM2ForCausalLM(InternLM2PreTrainedModel):
     def build_inputs(self, tokenizer, query: str, history: List[Tuple[str, str]] = [], meta_instruction=""):
         prompt = ""
         if meta_instruction:
-            prompt += f"""<s>[UNUSED_TOKEN_146]system\n{meta_instruction}[UNUSED_TOKEN_145]\n"""
+            prompt += f"""<s><|im_start|>system\n{meta_instruction}<|im_end|>\n"""
         else:
             prompt += "<s>"
         for record in history:
-            prompt += f"""[UNUSED_TOKEN_146]user\n{record[0]}[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n{record[1]}[UNUSED_TOKEN_145]\n"""
-        prompt += f"""[UNUSED_TOKEN_146]user\n{query}[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n"""
+            prompt += f"""<|im_start|>user\n{record[0]}<|im_end|>\n<|im_start|>assistant\n{record[1]}<|im_end|>\n"""
+        prompt += f"""<|im_start|>user\n{query}<|im_end|>\n<|im_start|>assistant\n"""
         return tokenizer([prompt], return_tensors="pt")

     @torch.no_grad()
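For orientation, a minimal standalone sketch of the ChatML-style prompt this hunk now assembles; the meta_instruction, history, and query values are illustrative, not taken from the repository:

# Sketch of the prompt string build_inputs produces after this change.
# All string values below are illustrative.
meta_instruction = "You are a helpful assistant."
history = [("Hi", "Hello! How can I help you?")]
query = "What can you do?"

prompt = f"<s><|im_start|>system\n{meta_instruction}<|im_end|>\n"
for user_turn, assistant_turn in history:
    prompt += f"<|im_start|>user\n{user_turn}<|im_end|>\n<|im_start|>assistant\n{assistant_turn}<|im_end|>\n"
prompt += f"<|im_start|>user\n{query}<|im_end|>\n<|im_start|>assistant\n"
print(prompt)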
@@ -1165,7 +1165,7 @@ class InternLM2ForCausalLM(InternLM2PreTrainedModel):
         inputs = self.build_inputs(tokenizer, query, history, meta_instruction)
         inputs = {k: v.to(self.device) for k, v in inputs.items() if torch.is_tensor(v)}
         # also add end-of-assistant token in eos token id to avoid unnecessary generation
-        eos_token_id = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids(["[UNUSED_TOKEN_145]"])[0]]
+        eos_token_id = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids(["<|im_end|>"])[0]]
         outputs = self.generate(
             **inputs,
             streamer=streamer,
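Because ChatML turns end with <|im_end|> rather than </s>, generation has to stop on either token. A minimal sketch of the same idea outside the class; the checkpoint name is an assumption for illustration:

# Stop generation on either the regular EOS or the new <|im_end|> token.
# The checkpoint name is an assumption, not taken from this diff.
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("internlm/internlm2-chat-7b", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained("internlm/internlm2-chat-7b", trust_remote_code=True)

inputs = tokenizer(["<s><|im_start|>user\nHi<|im_end|>\n<|im_start|>assistant\n"], return_tensors="pt")
im_end_id = tokenizer.convert_tokens_to_ids(["<|im_end|>"])[0]
# generate() accepts a list of eos token ids; hitting either one ends the sequence
outputs = model.generate(**inputs, eos_token_id=[tokenizer.eos_token_id, im_end_id], max_new_tokens=32)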
@@ -1178,7 +1178,7 @@ class InternLM2ForCausalLM(InternLM2PreTrainedModel):
         )
         outputs = outputs[0].cpu().tolist()[len(inputs["input_ids"][0]) :]
         response = tokenizer.decode(outputs, skip_special_tokens=True)
-        response = response.split("[UNUSED_TOKEN_145]")[0]
+        response = response.split("<|im_end|>")[0]
         history = history + [(query, response)]
         return response, history
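A toy illustration of the trim this hunk applies: everything from the first <|im_end|> onward is discarded (the decoded string here is made up):

# Toy example of the new stop-token trim; the decoded text is illustrative.
decoded = "I can answer questions.<|im_end|>extra tokens"
response = decoded.split("<|im_end|>")[0]
assert response == "I can answer questions."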
@@ -1231,7 +1231,7 @@ class InternLM2ForCausalLM(InternLM2PreTrainedModel):
                     return

             token = self.tokenizer.decode([value[-1]], skip_special_tokens=True)
-            if token.strip() != "[UNUSED_TOKEN_145]":
+            if token.strip() != "<|im_end|>":
                 self.response = self.response + token
                 history = self.history + [(self.query, self.response)]
                 self.queue.put((self.response, history))
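The public chat API is unchanged by this diff; only the prompt format and stop token differ. A usage sketch for both the blocking and streaming paths, again assuming the internlm/internlm2-chat-7b checkpoint name:

# Usage sketch; chat() and stream_chat() are defined in modeling_internlm2.py.
# The checkpoint name is an assumption for illustration.
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("internlm/internlm2-chat-7b", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained("internlm/internlm2-chat-7b", trust_remote_code=True).eval()

# blocking call: returns the full response plus updated history
response, history = model.chat(tokenizer, "Hi", history=[])
print(response)

# streaming call: the streamer callback patched above filters <|im_end|>
# out of the streamed text, yielding (partial_response, history) tuples
for partial_response, _ in model.stream_chat(tokenizer, "Hi"):
    print(partial_response)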
tokenizer_config.json CHANGED

@@ -11,5 +11,80 @@
   "model_max_length": 1000000000000000019884624838656,
   "pad_token": "</s>",
   "tokenizer_class": "InternLMTokenizer",
-  "unk_token": "<unk>"
-}
+  "unk_token": "<unk>",
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "92543": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "92542": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "92541": {
+      "content": "<|action_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "92540": {
+      "content": "<|action_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "92539": {
+      "content": "<|interpreter|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "92538": {
+      "content": "<|plugin|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "chat_template": "{{ bos_token }}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
+}
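With chat_template in place, the same format can be produced through Transformers' standard apply_chat_template helper instead of the model's custom build_inputs; note the template prepends bos_token (the "add bos in chat template" commit above). A sketch, with the checkpoint name again assumed:

# Render the new chat template; the checkpoint name is an assumption.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("internlm/internlm2-chat-7b", trust_remote_code=True)
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hi"},
]
text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(text)
# Expected shape: bos_token first, then one <|im_start|>{role}\n{content}<|im_end|>
# block per message, ending with an <|im_start|>assistant generation prompt.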