from bpetokenizer import BPETokenizer

# create a tokenizer instance; vocab, merges, and special tokens are loaded below
tokenizer = BPETokenizer()

# load a pretrained vocab, merges, and special tokens from a JSON file
tokenizer.load("sample_bpetokenizer.json", mode="json")

# sample text, wrapped in the special tokens registered with the tokenizer
encode_text = """
<|startoftext|>Hello, World! This is a sample text with the special tokens [SPECIAL1] and [SPECIAL2] to test the tokenizer.
Hello, Universe! Another example sentence containing [SPECIAL1] and [SPECIAL2], used to ensure the tokenizer's robustness.
Greetings, Earth! Here we have [SPECIAL1] appearing once again, followed by [SPECIAL2] in the same sentence.<|endoftext|>"""

# inspect what was loaded
print("vocab: ", tokenizer.vocab)
print('---')
print("merges: ", tokenizer.merges)
print('---')
print("special tokens: ", tokenizer.special_tokens)

# encode; special_tokens="all" lets every registered special token through
ids = tokenizer.encode(encode_text, special_tokens="all")
print('---')
print('Ids: ', ids)

# decode the ids back to text
decode_text = tokenizer.decode(ids)
print('---')
print(decode_text)
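
# optional sanity check, a sketch: assumes encode/decode round-trips losslessly
# for this vocab, including the special tokens
print('---')
print("round-trip ok: ", decode_text == encode_text)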