from bpetokenizer import BPETokenizer

# Load a previously trained tokenizer from its saved JSON file.
tokenizer = BPETokenizer()
tokenizer.load("sample_bpetokenizer.json", mode="json")
# Sample text containing the special tokens registered with the tokenizer.
encode_text = """
<|startoftext|>Hello, World! This is a sample text with the special tokens [SPECIAL1] and [SPECIAL2] to test the tokenizer.
Hello, Universe! Another example sentence containing [SPECIAL1] and [SPECIAL2], used to ensure tokenizer's robustness.
Greetings, Earth! Here we have [SPECIAL1] appearing once again, followed by [SPECIAL2] in the same sentence.<|endoftext|>"""
print("vocab: ", tokenizer.vocab)
print('---')
print("merges: ", tokenizer.merges)
print('---')
print("special tokens: ", tokenizer.special_tokens)
# Encode, allowing every registered special token to be matched in the text.
ids = tokenizer.encode(encode_text, special_tokens="all")
print('---')
print('Ids: ', ids)
# Decode the token ids back into text.
decode_text = tokenizer.decode(ids)
print('---')
print(decode_text)
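
# Optional sanity check (assumption: a byte-level BPE tokenizer encoded with
# special_tokens="all" round-trips the input text exactly on decode).
print('---')
print("round-trip matches input:", decode_text == encode_text)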