AnthonyPa57 committed
Commit b0c6cfa · verified · 1 parent: 60ef3a6

Upload tokenizer

Files changed (3):
  1. special_tokens_map.json  +29 -5
  2. tokenizer.json           +27 -242
  3. tokenizer_config.json    +10 -10
special_tokens_map.json CHANGED
@@ -1,10 +1,34 @@
 {
-  "bos_token": "<sos>",
-  "eos_token": "<eos>",
-  "mask_token": "<mask>",
-  "pad_token": "<pad>",
+  "bos_token": {
+    "content": "[SOS]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "[EOS]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "mask_token": {
+    "content": "[MASK]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "[PAD]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
   "unk_token": {
-    "content": "<unk>",
+    "content": "[UNK]",
     "lstrip": false,
     "normalized": false,
     "rstrip": false,
tokenizer.json CHANGED
@@ -5,7 +5,7 @@
   "added_tokens": [
     {
       "id": 0,
-      "content": "<unk>",
+      "content": "[UNK]",
       "single_word": false,
       "lstrip": false,
       "rstrip": false,
@@ -14,7 +14,7 @@
     },
     {
       "id": 1,
-      "content": "<pad>",
+      "content": "[PAD]",
       "single_word": false,
       "lstrip": false,
       "rstrip": false,
@@ -23,7 +23,7 @@
     },
     {
       "id": 2,
-      "content": "<mask>",
+      "content": "[MASK]",
       "single_word": false,
       "lstrip": false,
       "rstrip": false,
@@ -32,7 +32,7 @@
     },
     {
       "id": 3,
-      "content": "<sos>",
+      "content": "[SOS]",
       "single_word": false,
       "lstrip": false,
       "rstrip": false,
@@ -41,7 +41,7 @@
     },
     {
       "id": 4,
-      "content": "<eos>",
+      "content": "[EOS]",
       "single_word": false,
       "lstrip": false,
       "rstrip": false,
@@ -75,243 +75,28 @@
     "byte_fallback": false,
     "ignore_merges": false,
     "vocab": {
-      "<unk>": 0,
-      "<pad>": 1,
-      "<mask>": 2,
-      "<sos>": 3,
-      "<eos>": 4,
-      "?": 5,
-      "F": 6,
-      "P": 7,
-      "W": 8,
-      "a": 9,
-      "c": 10,
-      "e": 11,
-      "f": 12,
-      "h": 13,
-      "i": 14,
-      "l": 15,
-      "m": 16,
-      "n": 17,
-      "o": 18,
-      "p": 19,
-      "r": 20,
-      "s": 21,
-      "t": 22,
-      "x": 23,
-      "▁": 24,
-      "he": 25,
-      "the": 26,
-      "an": 27,
-      "ex": 28,
-      "is": 29,
-      "me": 30,
-      "othe": 31,
-      "ome": 32,
-      "some": 33,
-      "tex": 34,
-      "▁some": 35,
-      "▁tex": 36,
-      "other": 37,
-      "▁text": 38,
-      "Fr": 39,
-      "Pa": 40,
-      "Wh": 41,
-      "al": 42,
-      "ap": 43,
-      "at": 44,
-      "ce": 45,
-      "cap": 46,
-      "it": 47,
-      "of": 48,
-      "ris": 49,
-      "▁the": 50,
-      "▁an": 51,
-      "▁is": 52,
-      "▁other": 53,
-      "▁Fr": 54,
-      "▁Pa": 55,
-      "▁Wh": 56,
-      "▁cap": 57,
-      "▁of": 58,
-      "ance": 59,
-      "ital": 60,
-      "▁another": 61,
-      "▁France": 62,
-      "▁Paris": 63,
-      "▁What": 64,
-      "▁capital": 65,
-      "▁France?": 66
+      "[UNK]": 0,
+      "[PAD]": 1,
+      "[MASK]": 2,
+      "[SOS]": 3,
+      "[EOS]": 4,
+      "!": 5,
+      "I": 6,
+      "a": 7,
+      "c": 8,
+      "d": 9,
+      "e": 10,
+      "g": 11,
+      "h": 12,
+      "l": 13,
+      "m": 14,
+      "n": 15,
+      "o": 16,
+      "r": 17,
+      "v": 18,
+      "w": 19,
+      "▁": 20
     },
-    "merges": [
-      ["h", "e"],
-      ["t", "he"],
-      ["a", "n"],
-      ["e", "x"],
-      ["i", "s"],
-      ["m", "e"],
-      ["o", "the"],
-      ["o", "me"],
-      ["s", "ome"],
-      ["t", "ex"],
-      ["▁", "some"],
-      ["▁", "tex"],
-      ["othe", "r"],
-      ["▁tex", "t"],
-      ["F", "r"],
-      ["P", "a"],
-      ["W", "h"],
-      ["a", "l"],
-      ["a", "p"],
-      ["a", "t"],
-      ["c", "e"],
-      ["c", "ap"],
-      ["i", "t"],
-      ["o", "f"],
-      ["r", "is"],
-      ["▁", "the"],
-      ["▁", "an"],
-      ["▁", "is"],
-      ["▁", "other"],
-      ["▁", "Fr"],
-      ["▁", "Pa"],
-      ["▁", "Wh"],
-      ["▁", "cap"],
-      ["▁", "of"],
-      ["an", "ce"],
-      ["it", "al"],
-      ["▁an", "other"],
-      ["▁Fr", "ance"],
-      ["▁Pa", "ris"],
-      ["▁Wh", "at"],
-      ["▁cap", "ital"],
-      ["▁France", "?"]
-    ]
+    "merges": []
   }
 }
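Besides the renaming, this file's vocabulary shrinks from 67 entries to 21 (the five specials plus a 16-symbol alphabet) and the merges list becomes empty, so the BPE model can only emit single-character pieces. A minimal sketch of checking this with the tokenizers library, assuming a local checkout of this commit:

from tokenizers import Tokenizer

tok = Tokenizer.from_file("tokenizer.json")
print(tok.get_vocab_size())  # 21

# With "merges": [] every in-alphabet word encodes character by character;
# symbols outside the alphabet should fall back to [UNK] (id 0). The exact
# pieces depend on the pre-tokenizer, which this diff does not show.
print(tok.encode("hello").tokens)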
tokenizer_config.json CHANGED
@@ -1,7 +1,7 @@
 {
   "added_tokens_decoder": {
     "0": {
-      "content": "<unk>",
+      "content": "[UNK]",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
@@ -9,7 +9,7 @@
       "special": true
     },
     "1": {
-      "content": "<pad>",
+      "content": "[PAD]",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
@@ -17,7 +17,7 @@
       "special": true
     },
     "2": {
-      "content": "<mask>",
+      "content": "[MASK]",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
@@ -25,7 +25,7 @@
       "special": true
     },
     "3": {
-      "content": "<sos>",
+      "content": "[SOS]",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
@@ -33,7 +33,7 @@
       "special": true
     },
     "4": {
-      "content": "<eos>",
+      "content": "[EOS]",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
@@ -41,13 +41,13 @@
       "special": true
     }
   },
-  "bos_token": "<sos>",
+  "bos_token": "[SOS]",
   "clean_up_tokenization_spaces": false,
-  "eos_token": "<eos>",
+  "eos_token": "[EOS]",
   "extra_special_tokens": {},
-  "mask_token": "<mask>",
+  "mask_token": "[MASK]",
   "model_max_length": 1000000000000000019884624838656,
-  "pad_token": "<pad>",
+  "pad_token": "[PAD]",
   "tokenizer_class": "PreTrainedTokenizerFast",
-  "unk_token": "<unk>"
+  "unk_token": "[UNK]"
 }
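tokenizer_config.json mirrors the same renaming on the transformers side: the added_tokens_decoder ids and the top-level bos/eos/mask/pad/unk strings have to agree with tokenizer.json, which is why all three files were updated in one commit. A minimal sketch of loading the pair by hand with PreTrainedTokenizerFast (the file path is a placeholder for a local checkout):

from transformers import PreTrainedTokenizerFast

tok = PreTrainedTokenizerFast(
    tokenizer_file="tokenizer.json",
    bos_token="[SOS]", eos_token="[EOS]", unk_token="[UNK]",
    pad_token="[PAD]", mask_token="[MASK]",
)
# Ids follow added_tokens_decoder: [SOS] -> 3, [EOS] -> 4.
print(tok.convert_tokens_to_ids(["[SOS]", "[EOS]"]))  # [3, 4]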