AnthonyPa57 committed on
Commit
34ca956
·
verified ·
1 Parent(s): 0945294

Upload tokenizer

Browse files
Files changed (1) hide show
  1. tokenizer.json +147 -43
tokenizer.json CHANGED
@@ -113,38 +113,58 @@
113
  "\n": 8,
114
  "?": 9,
115
  "F": 10,
116
- "P": 11,
117
- "W": 12,
118
- "a": 13,
119
- "c": 14,
120
- "e": 15,
121
- "f": 16,
122
- "h": 17,
123
- "i": 18,
124
- "l": 19,
125
- "m": 20,
126
- "n": 21,
127
- "o": 22,
128
- "p": 23,
129
- "r": 24,
130
- "s": 25,
131
- "t": 26,
132
- "x": 27,
133
- "": 28,
134
- "he": 29,
135
- "the": 30,
136
  "an": 31,
137
  "ex": 32,
138
- "is": 33,
139
- "me": 34,
140
- "othe": 35,
141
- "ome": 36,
142
- "some": 37,
143
- "tex": 38,
144
- "▁some": 39,
145
- "▁tex": 40,
146
- "other": 41,
147
- "▁text": 42
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
148
  },
149
  "merges": [
150
  [
@@ -155,6 +175,10 @@
155
  "t",
156
  "he"
157
  ],
 
 
 
 
158
  [
159
  "a",
160
  "n"
@@ -163,18 +187,10 @@
163
  "e",
164
  "x"
165
  ],
166
- [
167
- "i",
168
- "s"
169
- ],
170
  [
171
  "m",
172
  "e"
173
  ],
174
- [
175
- "o",
176
- "the"
177
- ],
178
  [
179
  "o",
180
  "me"
@@ -187,21 +203,109 @@
187
  "t",
188
  "ex"
189
  ],
190
- [
191
- "▁",
192
- "some"
193
- ],
194
  [
195
  "▁",
196
  "tex"
197
  ],
198
  [
199
- "othe",
200
  "r"
201
  ],
202
  [
203
  "▁tex",
204
  "t"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
205
  ]
206
  ]
207
  }
 
113
  "\n": 8,
114
  "?": 9,
115
  "F": 10,
116
+ "W": 11,
117
+ "a": 12,
118
+ "c": 13,
119
+ "e": 14,
120
+ "f": 15,
121
+ "h": 16,
122
+ "i": 17,
123
+ "l": 18,
124
+ "m": 19,
125
+ "n": 20,
126
+ "o": 21,
127
+ "p": 22,
128
+ "r": 23,
129
+ "s": 24,
130
+ "t": 25,
131
+ "x": 26,
132
+ "": 27,
133
+ "he": 28,
134
+ "the": 29,
135
+ "▁o": 30,
136
  "an": 31,
137
  "ex": 32,
138
+ "me": 33,
139
+ "ome": 34,
140
+ "some": 35,
141
+ "tex": 36,
142
+ "▁tex": 37,
143
+ "ther": 38,
144
+ "▁text": 39,
145
+ "Fr": 40,
146
+ "Wh": 41,
147
+ "al": 42,
148
+ "ap": 43,
149
+ "at": 44,
150
+ "ce": 45,
151
+ "cap": 46,
152
+ "is": 47,
153
+ "it": 48,
154
+ "▁the": 49,
155
+ "▁some": 50,
156
+ "▁Fr": 51,
157
+ "▁Wh": 52,
158
+ "▁cap": 53,
159
+ "▁is": 54,
160
+ "▁of": 55,
161
+ "▁other": 56,
162
+ "ance": 57,
163
+ "ital": 58,
164
+ "▁France": 59,
165
+ "▁What": 60,
166
+ "▁capital": 61,
167
+ "▁France?": 62
168
  },
169
  "merges": [
170
  [
 
175
  "t",
176
  "he"
177
  ],
178
+ [
179
+ "▁",
180
+ "o"
181
+ ],
182
  [
183
  "a",
184
  "n"
 
187
  "e",
188
  "x"
189
  ],
 
 
 
 
190
  [
191
  "m",
192
  "e"
193
  ],
 
 
 
 
194
  [
195
  "o",
196
  "me"
 
203
  "t",
204
  "ex"
205
  ],
 
 
 
 
206
  [
207
  "▁",
208
  "tex"
209
  ],
210
  [
211
+ "the",
212
  "r"
213
  ],
214
  [
215
  "▁tex",
216
  "t"
217
+ ],
218
+ [
219
+ "F",
220
+ "r"
221
+ ],
222
+ [
223
+ "W",
224
+ "h"
225
+ ],
226
+ [
227
+ "a",
228
+ "l"
229
+ ],
230
+ [
231
+ "a",
232
+ "p"
233
+ ],
234
+ [
235
+ "a",
236
+ "t"
237
+ ],
238
+ [
239
+ "c",
240
+ "e"
241
+ ],
242
+ [
243
+ "c",
244
+ "ap"
245
+ ],
246
+ [
247
+ "i",
248
+ "s"
249
+ ],
250
+ [
251
+ "i",
252
+ "t"
253
+ ],
254
+ [
255
+ "▁",
256
+ "the"
257
+ ],
258
+ [
259
+ "▁",
260
+ "some"
261
+ ],
262
+ [
263
+ "▁",
264
+ "Fr"
265
+ ],
266
+ [
267
+ "▁",
268
+ "Wh"
269
+ ],
270
+ [
271
+ "▁",
272
+ "cap"
273
+ ],
274
+ [
275
+ "▁",
276
+ "is"
277
+ ],
278
+ [
279
+ "▁o",
280
+ "f"
281
+ ],
282
+ [
283
+ "▁o",
284
+ "ther"
285
+ ],
286
+ [
287
+ "an",
288
+ "ce"
289
+ ],
290
+ [
291
+ "it",
292
+ "al"
293
+ ],
294
+ [
295
+ "▁Fr",
296
+ "ance"
297
+ ],
298
+ [
299
+ "▁Wh",
300
+ "at"
301
+ ],
302
+ [
303
+ "▁cap",
304
+ "ital"
305
+ ],
306
+ [
307
+ "▁France",
308
+ "?"
309
  ]
310
  ]
311
  }