Upload tokenizer
Browse files- tokenizer.json +66 -81
tokenizer.json
CHANGED
|
@@ -142,48 +142,45 @@
|
|
| 142 |
"▁": 28,
|
| 143 |
"he": 29,
|
| 144 |
"the": 30,
|
| 145 |
-
"
|
| 146 |
-
"
|
| 147 |
-
"
|
| 148 |
-
"
|
| 149 |
-
"
|
| 150 |
-
"
|
| 151 |
-
"
|
| 152 |
-
"
|
| 153 |
-
"
|
| 154 |
-
"
|
| 155 |
-
"\
|
| 156 |
-
"▁
|
| 157 |
-
"▁
|
| 158 |
-
"
|
| 159 |
-
"
|
| 160 |
-
"
|
| 161 |
-
"
|
| 162 |
-
"
|
| 163 |
-
"
|
| 164 |
-
"
|
| 165 |
-
"
|
| 166 |
-
"
|
| 167 |
-
"
|
| 168 |
-
"
|
| 169 |
-
"
|
| 170 |
-
"
|
| 171 |
-
"▁
|
| 172 |
-
"▁
|
| 173 |
-
"▁
|
| 174 |
-
"
|
| 175 |
-
"
|
| 176 |
-
"
|
| 177 |
-
"▁
|
| 178 |
-
"
|
| 179 |
-
"
|
| 180 |
-
"▁France
|
| 181 |
-
"▁
|
| 182 |
-
"▁
|
| 183 |
-
"▁
|
| 184 |
-
"▁text\n\nsome": 70,
|
| 185 |
-
"▁text\n\n": 71,
|
| 186 |
-
"▁text\nWhat": 72
|
| 187 |
},
|
| 188 |
"merges": [
|
| 189 |
[
|
|
@@ -194,6 +191,10 @@
|
|
| 194 |
"t",
|
| 195 |
"he"
|
| 196 |
],
|
|
|
|
|
|
|
|
|
|
|
|
|
| 197 |
[
|
| 198 |
"e",
|
| 199 |
"x"
|
|
@@ -231,21 +232,29 @@
|
|
| 231 |
"t"
|
| 232 |
],
|
| 233 |
[
|
| 234 |
-
"
|
| 235 |
-
"n"
|
| 236 |
],
|
| 237 |
[
|
| 238 |
-
"
|
| 239 |
-
"
|
| 240 |
],
|
| 241 |
[
|
| 242 |
"▁text",
|
| 243 |
-
"\n"
|
| 244 |
],
|
| 245 |
[
|
| 246 |
-
"▁",
|
| 247 |
"other"
|
| 248 |
],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 249 |
[
|
| 250 |
"F",
|
| 251 |
"r"
|
|
@@ -292,12 +301,16 @@
|
|
| 292 |
],
|
| 293 |
[
|
| 294 |
"▁",
|
| 295 |
-
"
|
| 296 |
],
|
| 297 |
[
|
| 298 |
"▁",
|
| 299 |
"Fr"
|
| 300 |
],
|
|
|
|
|
|
|
|
|
|
|
|
|
| 301 |
[
|
| 302 |
"▁",
|
| 303 |
"cap"
|
|
@@ -314,22 +327,18 @@
|
|
| 314 |
"an",
|
| 315 |
"ce"
|
| 316 |
],
|
| 317 |
-
[
|
| 318 |
-
"Wh",
|
| 319 |
-
"at"
|
| 320 |
-
],
|
| 321 |
[
|
| 322 |
"it",
|
| 323 |
"al"
|
| 324 |
],
|
| 325 |
-
[
|
| 326 |
-
"▁an",
|
| 327 |
-
"other"
|
| 328 |
-
],
|
| 329 |
[
|
| 330 |
"▁Fr",
|
| 331 |
"ance"
|
| 332 |
],
|
|
|
|
|
|
|
|
|
|
|
|
|
| 333 |
[
|
| 334 |
"▁cap",
|
| 335 |
"ital"
|
|
@@ -337,30 +346,6 @@
|
|
| 337 |
[
|
| 338 |
"▁France",
|
| 339 |
"?"
|
| 340 |
-
],
|
| 341 |
-
[
|
| 342 |
-
"▁",
|
| 343 |
-
"some"
|
| 344 |
-
],
|
| 345 |
-
[
|
| 346 |
-
"▁France?",
|
| 347 |
-
"\nsome"
|
| 348 |
-
],
|
| 349 |
-
[
|
| 350 |
-
"▁",
|
| 351 |
-
"What"
|
| 352 |
-
],
|
| 353 |
-
[
|
| 354 |
-
"▁text\n",
|
| 355 |
-
"\nsome"
|
| 356 |
-
],
|
| 357 |
-
[
|
| 358 |
-
"▁text\n",
|
| 359 |
-
"\n"
|
| 360 |
-
],
|
| 361 |
-
[
|
| 362 |
-
"▁text\n",
|
| 363 |
-
"What"
|
| 364 |
]
|
| 365 |
]
|
| 366 |
}
|
|
|
|
| 142 |
"▁": 28,
|
| 143 |
"he": 29,
|
| 144 |
"the": 30,
|
| 145 |
+
"an": 31,
|
| 146 |
+
"ex": 32,
|
| 147 |
+
"me": 33,
|
| 148 |
+
"othe": 34,
|
| 149 |
+
"ome": 35,
|
| 150 |
+
"some": 36,
|
| 151 |
+
"tex": 37,
|
| 152 |
+
"▁tex": 38,
|
| 153 |
+
"other": 39,
|
| 154 |
+
"▁text": 40,
|
| 155 |
+
"\n\n": 41,
|
| 156 |
+
"▁an": 42,
|
| 157 |
+
"▁text\n\n": 43,
|
| 158 |
+
"▁another": 44,
|
| 159 |
+
"▁some": 45,
|
| 160 |
+
"▁text\n\nsome": 46,
|
| 161 |
+
"Fr": 47,
|
| 162 |
+
"Wh": 48,
|
| 163 |
+
"al": 49,
|
| 164 |
+
"ap": 50,
|
| 165 |
+
"at": 51,
|
| 166 |
+
"ce": 52,
|
| 167 |
+
"cap": 53,
|
| 168 |
+
"is": 54,
|
| 169 |
+
"it": 55,
|
| 170 |
+
"of": 56,
|
| 171 |
+
"▁the": 57,
|
| 172 |
+
"▁other": 58,
|
| 173 |
+
"▁Fr": 59,
|
| 174 |
+
"▁Wh": 60,
|
| 175 |
+
"▁cap": 61,
|
| 176 |
+
"▁is": 62,
|
| 177 |
+
"▁of": 63,
|
| 178 |
+
"ance": 64,
|
| 179 |
+
"ital": 65,
|
| 180 |
+
"▁France": 66,
|
| 181 |
+
"▁What": 67,
|
| 182 |
+
"▁capital": 68,
|
| 183 |
+
"▁France?": 69
|
|
|
|
|
|
|
|
|
|
| 184 |
},
|
| 185 |
"merges": [
|
| 186 |
[
|
|
|
|
| 191 |
"t",
|
| 192 |
"he"
|
| 193 |
],
|
| 194 |
+
[
|
| 195 |
+
"a",
|
| 196 |
+
"n"
|
| 197 |
+
],
|
| 198 |
[
|
| 199 |
"e",
|
| 200 |
"x"
|
|
|
|
| 232 |
"t"
|
| 233 |
],
|
| 234 |
[
|
| 235 |
+
"\n",
|
| 236 |
+
"\n"
|
| 237 |
],
|
| 238 |
[
|
| 239 |
+
"▁",
|
| 240 |
+
"an"
|
| 241 |
],
|
| 242 |
[
|
| 243 |
"▁text",
|
| 244 |
+
"\n\n"
|
| 245 |
],
|
| 246 |
[
|
| 247 |
+
"▁an",
|
| 248 |
"other"
|
| 249 |
],
|
| 250 |
+
[
|
| 251 |
+
"▁",
|
| 252 |
+
"some"
|
| 253 |
+
],
|
| 254 |
+
[
|
| 255 |
+
"▁text\n\n",
|
| 256 |
+
"some"
|
| 257 |
+
],
|
| 258 |
[
|
| 259 |
"F",
|
| 260 |
"r"
|
|
|
|
| 301 |
],
|
| 302 |
[
|
| 303 |
"▁",
|
| 304 |
+
"other"
|
| 305 |
],
|
| 306 |
[
|
| 307 |
"▁",
|
| 308 |
"Fr"
|
| 309 |
],
|
| 310 |
+
[
|
| 311 |
+
"▁",
|
| 312 |
+
"Wh"
|
| 313 |
+
],
|
| 314 |
[
|
| 315 |
"▁",
|
| 316 |
"cap"
|
|
|
|
| 327 |
"an",
|
| 328 |
"ce"
|
| 329 |
],
|
|
|
|
|
|
|
|
|
|
|
|
|
| 330 |
[
|
| 331 |
"it",
|
| 332 |
"al"
|
| 333 |
],
|
|
|
|
|
|
|
|
|
|
|
|
|
| 334 |
[
|
| 335 |
"▁Fr",
|
| 336 |
"ance"
|
| 337 |
],
|
| 338 |
+
[
|
| 339 |
+
"▁Wh",
|
| 340 |
+
"at"
|
| 341 |
+
],
|
| 342 |
[
|
| 343 |
"▁cap",
|
| 344 |
"ital"
|
|
|
|
| 346 |
[
|
| 347 |
"▁France",
|
| 348 |
"?"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 349 |
]
|
| 350 |
]
|
| 351 |
}
|