AnthonyPa57 committed on
Commit
2562632
·
verified ·
1 Parent(s): 306078e

Upload tokenizer

Browse files
Files changed (1) hide show
  1. tokenizer.json +66 -81
tokenizer.json CHANGED
@@ -142,48 +142,45 @@
142
  "▁": 28,
143
  "he": 29,
144
  "the": 30,
145
- "ex": 31,
146
- "me": 32,
147
- "othe": 33,
148
- "ome": 34,
149
- "some": 35,
150
- "tex": 36,
151
- "▁tex": 37,
152
- "other": 38,
153
- "▁text": 39,
154
- "an": 40,
155
- "\nsome": 41,
156
- "▁text\n": 42,
157
- "▁other": 43,
158
- "Fr": 44,
159
- "Wh": 45,
160
- "al": 46,
161
- "ap": 47,
162
- "at": 48,
163
- "ce": 49,
164
- "cap": 50,
165
- "is": 51,
166
- "it": 52,
167
- "of": 53,
168
- "▁the": 54,
169
- "▁an": 55,
170
- "▁Fr": 56,
171
- "▁cap": 57,
172
- "▁is": 58,
173
- "▁of": 59,
174
- "ance": 60,
175
- "What": 61,
176
- "ital": 62,
177
- "▁another": 63,
178
- "▁France": 64,
179
- "▁capital": 65,
180
- "▁France?": 66,
181
- "▁some": 67,
182
- "▁France?\nsome": 68,
183
- "▁What": 69,
184
- "▁text\n\nsome": 70,
185
- "▁text\n\n": 71,
186
- "▁text\nWhat": 72
187
  },
188
  "merges": [
189
  [
@@ -194,6 +191,10 @@
194
  "t",
195
  "he"
196
  ],
 
 
 
 
197
  [
198
  "e",
199
  "x"
@@ -231,21 +232,29 @@
231
  "t"
232
  ],
233
  [
234
- "a",
235
- "n"
236
  ],
237
  [
238
- "\n",
239
- "some"
240
  ],
241
  [
242
  "▁text",
243
- "\n"
244
  ],
245
  [
246
- "▁",
247
  "other"
248
  ],
 
 
 
 
 
 
 
 
249
  [
250
  "F",
251
  "r"
@@ -292,12 +301,16 @@
292
  ],
293
  [
294
  "▁",
295
- "an"
296
  ],
297
  [
298
  "▁",
299
  "Fr"
300
  ],
 
 
 
 
301
  [
302
  "▁",
303
  "cap"
@@ -314,22 +327,18 @@
314
  "an",
315
  "ce"
316
  ],
317
- [
318
- "Wh",
319
- "at"
320
- ],
321
  [
322
  "it",
323
  "al"
324
  ],
325
- [
326
- "▁an",
327
- "other"
328
- ],
329
  [
330
  "▁Fr",
331
  "ance"
332
  ],
 
 
 
 
333
  [
334
  "▁cap",
335
  "ital"
@@ -337,30 +346,6 @@
337
  [
338
  "▁France",
339
  "?"
340
- ],
341
- [
342
- "▁",
343
- "some"
344
- ],
345
- [
346
- "▁France?",
347
- "\nsome"
348
- ],
349
- [
350
- "▁",
351
- "What"
352
- ],
353
- [
354
- "▁text\n",
355
- "\nsome"
356
- ],
357
- [
358
- "▁text\n",
359
- "\n"
360
- ],
361
- [
362
- "▁text\n",
363
- "What"
364
  ]
365
  ]
366
  }
 
142
  "▁": 28,
143
  "he": 29,
144
  "the": 30,
145
+ "an": 31,
146
+ "ex": 32,
147
+ "me": 33,
148
+ "othe": 34,
149
+ "ome": 35,
150
+ "some": 36,
151
+ "tex": 37,
152
+ "▁tex": 38,
153
+ "other": 39,
154
+ "▁text": 40,
155
+ "\n\n": 41,
156
+ "▁an": 42,
157
+ "▁text\n\n": 43,
158
+ "▁another": 44,
159
+ "▁some": 45,
160
+ "▁text\n\nsome": 46,
161
+ "Fr": 47,
162
+ "Wh": 48,
163
+ "al": 49,
164
+ "ap": 50,
165
+ "at": 51,
166
+ "ce": 52,
167
+ "cap": 53,
168
+ "is": 54,
169
+ "it": 55,
170
+ "of": 56,
171
+ "▁the": 57,
172
+ "▁other": 58,
173
+ "▁Fr": 59,
174
+ "▁Wh": 60,
175
+ "▁cap": 61,
176
+ "▁is": 62,
177
+ "▁of": 63,
178
+ "ance": 64,
179
+ "ital": 65,
180
+ "▁France": 66,
181
+ "▁What": 67,
182
+ "▁capital": 68,
183
+ "▁France?": 69
 
 
 
184
  },
185
  "merges": [
186
  [
 
191
  "t",
192
  "he"
193
  ],
194
+ [
195
+ "a",
196
+ "n"
197
+ ],
198
  [
199
  "e",
200
  "x"
 
232
  "t"
233
  ],
234
  [
235
+ "\n",
236
+ "\n"
237
  ],
238
  [
239
+ "▁",
240
+ "an"
241
  ],
242
  [
243
  "▁text",
244
+ "\n\n"
245
  ],
246
  [
247
+ "▁an",
248
  "other"
249
  ],
250
+ [
251
+ "▁",
252
+ "some"
253
+ ],
254
+ [
255
+ "▁text\n\n",
256
+ "some"
257
+ ],
258
  [
259
  "F",
260
  "r"
 
301
  ],
302
  [
303
  "▁",
304
+ "other"
305
  ],
306
  [
307
  "▁",
308
  "Fr"
309
  ],
310
+ [
311
+ "▁",
312
+ "Wh"
313
+ ],
314
  [
315
  "▁",
316
  "cap"
 
327
  "an",
328
  "ce"
329
  ],
 
 
 
 
330
  [
331
  "it",
332
  "al"
333
  ],
 
 
 
 
334
  [
335
  "▁Fr",
336
  "ance"
337
  ],
338
+ [
339
+ "▁Wh",
340
+ "at"
341
+ ],
342
  [
343
  "▁cap",
344
  "ital"
 
346
  [
347
  "▁France",
348
  "?"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
349
  ]
350
  ]
351
  }