neilmehta24 committed on
Commit 1402537 · verified · 1 Parent(s): 26f8a93

Delete tokenization_ernie4_5.py

Files changed (1)
  1. tokenization_ernie4_5.py +0 -214
tokenization_ernie4_5.py DELETED
@@ -1,214 +0,0 @@
- # Copyright (c) 2025 Baidu, Inc. All Rights Reserved.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
-
- import os
- from shutil import copyfile
- from typing import List, Optional, Tuple
- import sentencepiece as spm
-
- from transformers.tokenization_utils import PreTrainedTokenizer
- from transformers.utils import logging
-
-
- logger = logging.get_logger(__name__)
-
-
- class Ernie4_5_Tokenizer(PreTrainedTokenizer):
-
-     vocab_files_names = {
-         "vocab_file": "tokenizer.model",
-     }
-     # Model input names expected by the tokenizer
-     model_input_names = ["input_ids", "position_ids", "attention_mask", "labels"]
-     # Padding side (where to add padding tokens)
-     padding_side = "right"
-
-     def __init__(
-         self,
-         vocab_file,
-         bos_token="<s>",
-         cls_token="<cls>",
-         eos_token="</s>",
-         mask_token="<mask:0>",
-         pad_token="<pad>",
-         sep_token="<sep>",
-         unk_token="<unk>",
-         additional_special_tokens=None,
-         verbose=False,
-         **kwargs,
-     ):
-         """
-         Initialize the ERNIE tokenizer.
-
-         Args:
-             vocab_file (str): Path to the SentencePiece model file.
-             bos_token (str, optional): Beginning of sentence token. Defaults to "<s>".
-             cls_token (str, optional): Classification token. Defaults to "<cls>".
-             eos_token (str, optional): End of sentence token. Defaults to "</s>".
-             mask_token (str, optional): Mask token. Defaults to "<mask:0>".
-             pad_token (str, optional): Padding token. Defaults to "<pad>".
-             sep_token (str, optional): Separator token. Defaults to "<sep>".
-             unk_token (str, optional): Unknown token. Defaults to "<unk>".
-             additional_special_tokens (List[str], optional): Additional special tokens.
-                 Defaults to ["<mask:1>", "<mask:7>"].
-             verbose (bool, optional): Whether to print detailed logs or progress information during execution.
-             **kwargs: Additional keyword arguments passed to the parent class.
-         """
-
-         self.vocab_file = vocab_file
-         self.sp_model = spm.SentencePieceProcessor()
-         self.sp_model.Load(vocab_file)
-
-         if additional_special_tokens is None:
-             additional_special_tokens = ["<mask:1>", "<mask:7>"]
-         super().__init__(
-             bos_token=bos_token,
-             cls_token=cls_token,
-             eos_token=eos_token,
-             mask_token=mask_token,
-             pad_token=pad_token,
-             sep_token=sep_token,
-             unk_token=unk_token,
-             additional_special_tokens=additional_special_tokens,
-             verbose=verbose,
-             **kwargs,
-         )
-
-     @property
-     def vocab_size(self):
-         """Returns the size of the vocabulary.
-
-         Returns:
-             int: The number of tokens in the vocabulary.
-         """
-         return self.sp_model.vocab_size()
-
-     def get_vocab(self):
-         """Get the vocabulary as a dictionary mapping tokens to their IDs.
-
-         Returns:
-             dict: A dictionary mapping tokens to their corresponding IDs.
-         """
-         vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
-         vocab.update(self.added_tokens_encoder)
-         return vocab
-
-     def _tokenize(self, text):
-         """Tokenize text using SentencePiece.
-
-         Args:
-             text (str): The text to tokenize.
-
-         Returns:
-             list: A list of tokens.
-         """
-         return self.sp_model.encode_as_pieces(text)
-
-     def _convert_token_to_id(self, token):
-         """Convert a token (str) to an ID using the vocabulary.
-
-         Args:
-             token (str): The token to convert.
-
-         Returns:
-             int: The corresponding token ID.
-         """
-         return self.sp_model.piece_to_id(token)
-
-     def _convert_id_to_token(self, id):
-         """Convert an ID to a token (str) using the vocabulary.
-
-         Args:
-             id (int): The token ID to convert.
-
-         Returns:
-             str: The corresponding token.
-         """
-         if id >= self.vocab_size:
-             return self.unk_token
-         else:
-             return self.sp_model.id_to_piece(id)
-
-     def convert_tokens_to_string(self, tokens):
-         """Convert a sequence of tokens back to a single string.
-
-         Args:
-             tokens (List[str]): A list of tokens to convert.
-
-         Returns:
-             str: The reconstructed string.
-         """
-         current_sub_tokens = []
-         out_string = ""
-         for token in tokens:
-             # make sure that special tokens are not decoded using sentencepiece model
-             if token in self.all_special_tokens:
-                 out_string += self.sp_model.decode(current_sub_tokens) + token
-                 current_sub_tokens = []
-             else:
-                 current_sub_tokens.append(token)
-         out_string += self.sp_model.decode(current_sub_tokens)
-         return out_string
-
-     def prepare_for_model(self, *args, **kwargs):
-         if "add_special_tokens" in kwargs:
-             kwargs.pop("add_special_tokens")
-         return super().prepare_for_model(*args, **kwargs)
-
-     def save_vocabulary(
-         self, save_directory, filename_prefix: Optional[str] = None
-     ) -> Tuple[str]:
-         """
-         Save the vocabulary and special tokens file to a directory.
-
-         Args:
-             save_directory (str): The directory in which to save the vocabulary.
-             filename_prefix (Optional[str]): Optional prefix for the saved filename.
-
-         Returns:
-             Tuple[str]: Paths to the files saved.
-
-         Raises:
-             ValueError: If the save_directory is not a valid directory.
-         """
-         if not os.path.isdir(save_directory):
-             logger.error(f"Vocabulary path ({save_directory}) should be a directory")
-             return
-         out_vocab_file = os.path.join(
-             save_directory,
-             (filename_prefix + "-" if filename_prefix else "")
-             + self.vocab_files_names["vocab_file"],
-         )
-
-         if os.path.abspath(self.vocab_file) != os.path.abspath(
-             out_vocab_file
-         ) and os.path.isfile(self.vocab_file):
-             copyfile(self.vocab_file, out_vocab_file)
-         elif not os.path.isfile(self.vocab_file):
-             with open(out_vocab_file, "wb") as fi:
-                 content_spiece_model = self.sp_model.serialized_model_proto()
-                 fi.write(content_spiece_model)
-
-         return (out_vocab_file,)
-
-     def _decode(self, *args, **kwargs):
-         kwargs.pop("clean_up_tokenization_spaces", None)
-         kwargs.pop("spaces_between_special_tokens", None)
-         return super()._decode(
-             *args,
-             **kwargs,
-             clean_up_tokenization_spaces=False,
-             spaces_between_special_tokens=False,
-         )
-
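
For context, the deleted class is a thin wrapper around a SentencePiece model, so everything it did can be exercised with just this module and a tokenizer.model file. A minimal usage sketch, assuming the module is still available locally as tokenization_ernie4_5.py and that a SentencePiece model exists at ./tokenizer.model (both paths are illustrative assumptions, not taken from this commit):

    # Hypothetical usage of the removed slow tokenizer; paths are assumed, not from this repo.
    from tokenization_ernie4_5 import Ernie4_5_Tokenizer

    tok = Ernie4_5_Tokenizer(vocab_file="./tokenizer.model")  # assumed local SentencePiece file

    pieces = tok.tokenize("ERNIE 4.5 tokenization test")  # SentencePiece pieces via _tokenize
    ids = tok.convert_tokens_to_ids(pieces)               # vocabulary ids via _convert_token_to_id
    text = tok.decode(ids)                                # round trip; _decode disables cleanup and extra spacing

    print(pieces)
    print(ids)
    print(text)

The sketch relies only on methods defined in the deleted file plus the standard PreTrainedTokenizer API.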