from bpetokenizer import BPETokenizer

# Load a previously trained tokenizer from its saved JSON file.
tokenizer = BPETokenizer()
tokenizer.load("sample_bpetokenizer.json", mode="json")
# Sample text containing the special tokens registered with the tokenizer.
encode_text = """
<|startoftext|>Hello, World! This is a sample text with the special tokens [SPECIAL1] and [SPECIAL2] to test the tokenizer.
Hello, Universe! Another example sentence containing [SPECIAL1] and [SPECIAL2], used to ensure tokenizer's robustness.
Greetings, Earth! Here we have [SPECIAL1] appearing once again, followed by [SPECIAL2] in the same sentence.<|endoftext|>"""
print("vocab: ", tokenizer.vocab)
print('---')
print("merges: ", tokenizer.merges)
print('---')
print("special tokens: ", tokenizer.special_tokens)
# Encode, allowing every registered special token to be matched in the text.
ids = tokenizer.encode(encode_text, special_tokens="all")
print('---')
print('Ids: ', ids)
# Decode the token ids back into text.
decode_text = tokenizer.decode(ids)
print('---')
print(decode_text)
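
# Optional sanity check (assumption: a byte-level BPE tokenizer encoded with
# special_tokens="all" round-trips the input text exactly on decode).
print('---')
print("round-trip matches input:", decode_text == encode_text)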