AnthonyPa57 committed
Commit 306078e · verified · 1 Parent(s): 34ca956

Upload tokenizer

Files changed (3):
  1. special_tokens_map.json +1 -0
  2. tokenizer.json +129 -74
  3. tokenizer_config.json +10 -1
special_tokens_map.json CHANGED
@@ -1,4 +1,5 @@
  {
+ "bos_token": "<sos>",
  "eos_token": "<eos>",
  "mask_token": "<mask>",
  "pad_token": "<pad>",
tokenizer.json CHANGED
@@ -74,6 +74,15 @@
  "rstrip": false,
  "normalized": false,
  "special": true
+ },
+ {
+ "id": 8,
+ "content": "<cls>",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
  }
  ],
  "normalizer": {
@@ -110,61 +119,71 @@
  "<|system|>": 5,
  "<|assistant|>": 6,
  "<|user|>": 7,
- "\n": 8,
- "?": 9,
- "F": 10,
- "W": 11,
- "a": 12,
- "c": 13,
- "e": 14,
- "f": 15,
- "h": 16,
- "i": 17,
- "l": 18,
- "m": 19,
- "n": 20,
- "o": 21,
- "p": 22,
- "r": 23,
- "s": 24,
- "t": 25,
- "x": 26,
- "▁": 27,
- "he": 28,
- "the": 29,
- "▁o": 30,
- "an": 31,
- "ex": 32,
- "me": 33,
+ "<cls>": 8,
+ "\n": 9,
+ "?": 10,
+ "F": 11,
+ "W": 12,
+ "a": 13,
+ "c": 14,
+ "e": 15,
+ "f": 16,
+ "h": 17,
+ "i": 18,
+ "l": 19,
+ "m": 20,
+ "n": 21,
+ "o": 22,
+ "p": 23,
+ "r": 24,
+ "s": 25,
+ "t": 26,
+ "x": 27,
+ "▁": 28,
+ "he": 29,
+ "the": 30,
+ "ex": 31,
+ "me": 32,
+ "othe": 33,
  "ome": 34,
  "some": 35,
  "tex": 36,
  "▁tex": 37,
- "ther": 38,
+ "other": 38,
  "▁text": 39,
- "Fr": 40,
- "Wh": 41,
- "al": 42,
- "ap": 43,
- "at": 44,
- "ce": 45,
- "cap": 46,
- "is": 47,
- "it": 48,
- "▁the": 49,
- "▁some": 50,
- "▁Fr": 51,
- "▁Wh": 52,
- "▁cap": 53,
- "▁is": 54,
- "▁of": 55,
- "▁other": 56,
- "ance": 57,
- "ital": 58,
- "▁France": 59,
- "▁What": 60,
- "▁capital": 61,
- "▁France?": 62
+ "an": 40,
+ "\nsome": 41,
+ "▁text\n": 42,
+ "▁other": 43,
+ "Fr": 44,
+ "Wh": 45,
+ "al": 46,
+ "ap": 47,
+ "at": 48,
+ "ce": 49,
+ "cap": 50,
+ "is": 51,
+ "it": 52,
+ "of": 53,
+ "▁the": 54,
+ "▁an": 55,
+ "▁Fr": 56,
+ "▁cap": 57,
+ "▁is": 58,
+ "▁of": 59,
+ "ance": 60,
+ "What": 61,
+ "ital": 62,
+ "▁another": 63,
+ "▁France": 64,
+ "▁capital": 65,
+ "▁France?": 66,
+ "▁some": 67,
+ "▁France?\nsome": 68,
+ "▁What": 69,
+ "▁text\n\nsome": 70,
+ "▁text\n\n": 71,
+ "▁text\nWhat": 72
  },
  "merges": [
  [
@@ -175,14 +194,6 @@
  "t",
  "he"
  ],
- [
- "▁",
- "o"
- ],
- [
- "a",
- "n"
- ],
  [
  "e",
  "x"
@@ -191,6 +202,10 @@
  "m",
  "e"
  ],
+ [
+ "o",
+ "the"
+ ],
  [
  "o",
  "me"
@@ -208,13 +223,29 @@
  "tex"
  ],
  [
- "the",
+ "othe",
  "r"
  ],
  [
  "▁tex",
  "t"
  ],
+ [
+ "a",
+ "n"
+ ],
+ [
+ "\n",
+ "some"
+ ],
+ [
+ "▁text",
+ "\n"
+ ],
+ [
+ "▁",
+ "other"
+ ],
  [
  "F",
  "r"
@@ -252,20 +283,20 @@
  "t"
  ],
  [
- "▁",
- "the"
+ "o",
+ "f"
  ],
  [
  "▁",
- "some"
+ "the"
  ],
  [
  "▁",
- "Fr"
+ "an"
  ],
  [
  "▁",
- "Wh"
+ "Fr"
  ],
  [
  "▁",
@@ -276,28 +307,28 @@
  "is"
  ],
  [
- "▁o",
- "f"
- ],
- [
- "▁o",
- "ther"
+ "▁",
+ "of"
  ],
  [
  "an",
  "ce"
  ],
+ [
+ "Wh",
+ "at"
+ ],
  [
  "it",
  "al"
  ],
  [
- "▁Fr",
- "ance"
+ "▁an",
+ "other"
  ],
  [
- "▁Wh",
- "at"
+ "▁Fr",
+ "ance"
  ],
  [
  "▁cap",
@@ -306,6 +337,30 @@
  [
  "▁France",
  "?"
+ ],
+ [
+ "▁",
+ "some"
+ ],
+ [
+ "▁France?",
+ "\nsome"
+ ],
+ [
+ "▁",
+ "What"
+ ],
+ [
+ "▁text\n",
+ "\nsome"
+ ],
+ [
+ "▁text\n",
+ "\n"
+ ],
+ [
+ "▁text\n",
+ "What"
  ]
  ]
  }
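A minimal sketch (not part of the commit) that inspects the regenerated tokenizer.json above with the tokenizers library; the file path assumes a local copy of the file from this commit:

# Assumes tokenizer.json from this commit is in the working directory.
from tokenizers import Tokenizer

tok = Tokenizer.from_file("tokenizer.json")

print(tok.token_to_id("<cls>"))     # 8, the newly registered special token
print(tok.token_to_id("▁another"))  # 63, one of the new vocab entries

enc = tok.encode("What is the capital of France?")
print(enc.tokens)  # segmentation under the updated merges
print(enc.ids)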
tokenizer_config.json CHANGED
@@ -63,16 +63,25 @@
  "rstrip": false,
  "single_word": false,
  "special": true
+ },
+ "8": {
+ "content": "<cls>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
  }
  },
  "assistant_token": "<|assistant|>",
+ "bos_token": "<sos>",
+ "class_token": "<cls>",
  "clean_up_tokenization_spaces": false,
  "eos_token": "<eos>",
  "extra_special_tokens": {},
  "mask_token": "<mask>",
  "model_max_length": 1000000000000000019884624838656,
  "pad_token": "<pad>",
- "sos_token": "<sos>",
  "system_token": "<|system|>",
  "tokenizer_class": "PreTrainedTokenizerFast",
  "unk_token": "<unk>",