AnthonyPa57 committed
Commit 7664361 · verified · 1 Parent(s): b0c6cfa

Upload tokenizer

Files changed (3):
1. special_tokens_map.json +5 -29
2. tokenizer.json +242 -27
3. tokenizer_config.json +10 -10
special_tokens_map.json CHANGED
@@ -1,34 +1,10 @@
 {
-  "bos_token": {
-    "content": "[SOS]",
-    "lstrip": false,
-    "normalized": false,
-    "rstrip": false,
-    "single_word": false
-  },
-  "eos_token": {
-    "content": "[EOS]",
-    "lstrip": false,
-    "normalized": false,
-    "rstrip": false,
-    "single_word": false
-  },
-  "mask_token": {
-    "content": "[MASK]",
-    "lstrip": false,
-    "normalized": false,
-    "rstrip": false,
-    "single_word": false
-  },
-  "pad_token": {
-    "content": "[PAD]",
-    "lstrip": false,
-    "normalized": false,
-    "rstrip": false,
-    "single_word": false
-  },
+  "bos_token": "<sos>",
+  "eos_token": "<eos>",
+  "mask_token": "<mask>",
+  "pad_token": "<pad>",
   "unk_token": {
-    "content": "[UNK]",
+    "content": "<unk>",
     "lstrip": false,
     "normalized": false,
     "rstrip": false,
tokenizer.json CHANGED
@@ -5,7 +5,7 @@
   "added_tokens": [
     {
       "id": 0,
-      "content": "[UNK]",
+      "content": "<unk>",
       "single_word": false,
       "lstrip": false,
       "rstrip": false,
@@ -14,7 +14,7 @@
     },
     {
       "id": 1,
-      "content": "[PAD]",
+      "content": "<pad>",
       "single_word": false,
       "lstrip": false,
       "rstrip": false,
@@ -23,7 +23,7 @@
     },
     {
       "id": 2,
-      "content": "[MASK]",
+      "content": "<mask>",
       "single_word": false,
       "lstrip": false,
       "rstrip": false,
@@ -32,7 +32,7 @@
     },
     {
       "id": 3,
-      "content": "[SOS]",
+      "content": "<sos>",
       "single_word": false,
       "lstrip": false,
       "rstrip": false,
@@ -41,7 +41,7 @@
     },
     {
       "id": 4,
-      "content": "[EOS]",
+      "content": "<eos>",
       "single_word": false,
       "lstrip": false,
       "rstrip": false,
@@ -75,28 +75,243 @@
     "byte_fallback": false,
     "ignore_merges": false,
     "vocab": {
-      "[UNK]": 0,
-      "[PAD]": 1,
-      "[MASK]": 2,
-      "[SOS]": 3,
-      "[EOS]": 4,
-      "!": 5,
-      "I": 6,
-      "a": 7,
-      "c": 8,
-      "d": 9,
-      "e": 10,
-      "g": 11,
-      "h": 12,
-      "l": 13,
-      "m": 14,
-      "n": 15,
-      "o": 16,
-      "r": 17,
-      "v": 18,
-      "w": 19,
-      "": 20
+      "<unk>": 0,
+      "<pad>": 1,
+      "<mask>": 2,
+      "<sos>": 3,
+      "<eos>": 4,
+      "?": 5,
+      "F": 6,
+      "P": 7,
+      "W": 8,
+      "a": 9,
+      "c": 10,
+      "e": 11,
+      "f": 12,
+      "h": 13,
+      "i": 14,
+      "l": 15,
+      "m": 16,
+      "n": 17,
+      "o": 18,
+      "p": 19,
+      "r": 20,
+      "s": 21,
+      "t": 22,
+      "x": 23,
+      "▁": 24,
+      "he": 25,
+      "the": 26,
+      "an": 27,
+      "ex": 28,
+      "is": 29,
+      "me": 30,
+      "othe": 31,
+      "ome": 32,
+      "some": 33,
+      "tex": 34,
+      "▁some": 35,
+      "▁tex": 36,
+      "other": 37,
+      "▁text": 38,
+      "Fr": 39,
+      "Pa": 40,
+      "Wh": 41,
+      "al": 42,
+      "ap": 43,
+      "at": 44,
+      "ce": 45,
+      "cap": 46,
+      "it": 47,
+      "of": 48,
+      "ris": 49,
+      "▁the": 50,
+      "▁an": 51,
+      "▁is": 52,
+      "▁other": 53,
+      "▁Fr": 54,
+      "▁Pa": 55,
+      "▁Wh": 56,
+      "▁cap": 57,
+      "▁of": 58,
+      "ance": 59,
+      "ital": 60,
+      "▁another": 61,
+      "▁France": 62,
+      "▁Paris": 63,
+      "▁What": 64,
+      "▁capital": 65,
+      "▁France?": 66
     },
-    "merges": []
+    "merges": [
+      [
+        "h",
+        "e"
+      ],
+      [
+        "t",
+        "he"
+      ],
+      [
+        "a",
+        "n"
+      ],
+      [
+        "e",
+        "x"
+      ],
+      [
+        "i",
+        "s"
+      ],
+      [
+        "m",
+        "e"
+      ],
+      [
+        "o",
+        "the"
+      ],
+      [
+        "o",
+        "me"
+      ],
+      [
+        "s",
+        "ome"
+      ],
+      [
+        "t",
+        "ex"
+      ],
+      [
+        "▁",
+        "some"
+      ],
+      [
+        "▁",
+        "tex"
+      ],
+      [
+        "othe",
+        "r"
+      ],
+      [
+        "▁tex",
+        "t"
+      ],
+      [
+        "F",
+        "r"
+      ],
+      [
+        "P",
+        "a"
+      ],
+      [
+        "W",
+        "h"
+      ],
+      [
+        "a",
+        "l"
+      ],
+      [
+        "a",
+        "p"
+      ],
+      [
+        "a",
+        "t"
+      ],
+      [
+        "c",
+        "e"
+      ],
+      [
+        "c",
+        "ap"
+      ],
+      [
+        "i",
+        "t"
+      ],
+      [
+        "o",
+        "f"
+      ],
+      [
+        "r",
+        "is"
+      ],
+      [
+        "▁",
+        "the"
+      ],
+      [
+        "▁",
+        "an"
+      ],
+      [
+        "▁",
+        "is"
+      ],
+      [
+        "▁",
+        "other"
+      ],
+      [
+        "▁",
+        "Fr"
+      ],
+      [
+        "▁",
+        "Pa"
+      ],
+      [
+        "▁",
+        "Wh"
+      ],
+      [
+        "▁",
+        "cap"
+      ],
+      [
+        "▁",
+        "of"
+      ],
+      [
+        "an",
+        "ce"
+      ],
+      [
+        "it",
+        "al"
+      ],
+      [
+        "▁an",
+        "other"
+      ],
+      [
+        "▁Fr",
+        "ance"
+      ],
+      [
+        "▁Pa",
+        "ris"
+      ],
+      [
+        "▁Wh",
+        "at"
+      ],
+      [
+        "▁cap",
+        "ital"
+      ],
+      [
+        "▁France",
+        "?"
+      ]
+    ]
   }
 }
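The retrain replaces the old 21-entry character-level vocab (which had an empty merges list) with a 67-entry BPE vocab and 42 learned merges; the ▁-prefixed entries indicate Metaspace-style pre-tokenization, and the merge chain builds up to whole words like ▁capital and ▁France?. A minimal encoding sketch, assuming the tokenizers package and the updated file in the working directory; the expected output is inferred from the new vocab and merges and depends on normalizer/pre-tokenizer settings this diff does not show:

from tokenizers import Tokenizer

# Assumes the updated tokenizer.json sits in the working directory.
tok = Tokenizer.from_file("tokenizer.json")

enc = tok.encode("What is the capital of France?")
print(enc.tokens)  # plausibly ['▁What', '▁is', '▁the', '▁capital', '▁of', '▁France?']
print(enc.ids)     # plausibly [64, 52, 50, 65, 58, 66]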
tokenizer_config.json CHANGED
@@ -1,7 +1,7 @@
 {
   "added_tokens_decoder": {
     "0": {
-      "content": "[UNK]",
+      "content": "<unk>",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
@@ -9,7 +9,7 @@
       "special": true
     },
     "1": {
-      "content": "[PAD]",
+      "content": "<pad>",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
@@ -17,7 +17,7 @@
       "special": true
     },
     "2": {
-      "content": "[MASK]",
+      "content": "<mask>",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
@@ -25,7 +25,7 @@
       "special": true
     },
     "3": {
-      "content": "[SOS]",
+      "content": "<sos>",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
@@ -33,7 +33,7 @@
       "special": true
     },
     "4": {
-      "content": "[EOS]",
+      "content": "<eos>",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
@@ -41,13 +41,13 @@
       "special": true
     }
   },
-  "bos_token": "[SOS]",
+  "bos_token": "<sos>",
   "clean_up_tokenization_spaces": false,
-  "eos_token": "[EOS]",
+  "eos_token": "<eos>",
   "extra_special_tokens": {},
-  "mask_token": "[MASK]",
+  "mask_token": "<mask>",
   "model_max_length": 1000000000000000019884624838656,
-  "pad_token": "[PAD]",
+  "pad_token": "<pad>",
   "tokenizer_class": "PreTrainedTokenizerFast",
-  "unk_token": "[UNK]"
+  "unk_token": "<unk>"
 }
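The config keeps tokenizer_class as PreTrainedTokenizerFast and renames the same five entries in added_tokens_decoder, so ids 0 through 4 are unchanged. A minimal cross-check, again assuming the transformers package and the hypothetical ./tokenizer-dir checkout:

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("./tokenizer-dir")  # hypothetical path

# Renaming the specials leaves their ids intact, per added_tokens_decoder:
for token in ("<unk>", "<pad>", "<mask>", "<sos>", "<eos>"):
    print(token, tok.convert_tokens_to_ids(token))
# -> <unk> 0, <pad> 1, <mask> 2, <sos> 3, <eos> 4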