from bpetokenizer import BPETokenizer

# Load a pretrained tokenizer (vocab, merges, and special tokens) from a JSON file.
tokenizer = BPETokenizer()
tokenizer.load("sample_bpetokenizer.json", mode="json")

# Sample text wrapped in special tokens to exercise special-token handling.
encode_text = """

<|startoftext|>Hello, World! This is a sample text with the special tokens [SPECIAL1] and [SPECIAL2] to test the tokenizer.

Hello, Universe! Another example sentence containing [SPECIAL1] and [SPECIAL2], used to ensure the tokenizer's robustness.

Greetings, Earth! Here we have [SPECIAL1] appearing once again, followed by [SPECIAL2] in the same sentence.<|endoftext|>"""

print("vocab: ", tokenizer.vocab)
print('---')
print("merges: ", tokenizer.merges)
print('---')
print("special tokens: ", tokenizer.special_tokens)

# Encode, allowing all registered special tokens to be matched.
ids = tokenizer.encode(encode_text, special_tokens="all")
print('---')
print('Ids: ', ids)

# Decode back to text; this should round-trip the original input.
decode_text = tokenizer.decode(ids)
print('---')
print(decode_text)