"""Example: train a YouTokenToMe BPE model on random text and tokenize a sample.

The script writes a synthetic training corpus to ``train_data_path``, trains a
BPE model from it into ``model_path``, loads that model, and prints the
encoding of a random test string both as token ids and as subword strings.
"""
import random

import youtokentome as yttm

train_data_path = "train_data.txt"
model_path = "example.model"

# Generate the training file: n_lines lines of n_characters characters each,
# every character drawn uniformly from "abcd ".
n_lines = 10000
n_characters = 100
# The corpus is pure ASCII; the encoding is stated explicitly so the bytes
# written do not depend on the platform's default locale.
with open(train_data_path, "w", encoding="utf-8") as fout:
    for _ in range(n_lines):
        line = "".join([random.choice("abcd ") for _ in range(n_characters)])
        print(line, file=fout)

# Generate the text to tokenize. Its alphabet additionally contains "e",
# a character that never occurs in the training file.
test_text = "".join([random.choice("abcde ") for _ in range(100)])

# Train the model from the generated corpus.
yttm.BPE.train(data=train_data_path, vocab_size=5000, model=model_path)

# Load the model only after training has been run for model_path; no model
# is loaded before this point.
bpe = yttm.BPE(model=model_path)

# Two types of tokenization: integer ids and subword strings.
print(bpe.encode([test_text], output_type=yttm.OutputType.ID))
print(bpe.encode([test_text], output_type=yttm.OutputType.SUBWORD))