from bpetokenizer import BPETokenizer

# Load a pretrained tokenizer (vocab, merges, and special tokens) from a JSON file.
tokenizer = BPETokenizer()
tokenizer.load("sample_bpetokenizer.json", mode="json")

# Sample text wrapped in special tokens to exercise special-token handling.
encode_text = """

<|startoftext|>Hello, World! This is a sample text with the special tokens [SPECIAL1] and [SPECIAL2] to test the tokenizer.

Hello, Universe! Another example sentence containing [SPECIAL1] and [SPECIAL2], used to ensure the tokenizer's robustness.

Greetings, Earth! Here we have [SPECIAL1] appearing once again, followed by [SPECIAL2] in the same sentence.<|endoftext|>"""

print("vocab: ", tokenizer.vocab)
print('---')
print("merges: ", tokenizer.merges)
print('---')
print("special tokens: ", tokenizer.special_tokens)

# Encode, allowing all registered special tokens to be matched.
ids = tokenizer.encode(encode_text, special_tokens="all")
print('---')
print('Ids: ', ids)

# Decode back to text; this should round-trip the original input.
decode_text = tokenizer.decode(ids)
print('---')
print(decode_text)