from minbpe import RegexTokenizer
# Initialize the tokenizer
tokenizer = RegexTokenizer()
# Read text from a file
file_path = "/Users/mohammad.ibrahim/Desktop/TSAI/combined_text.txt"
with open(file_path, 'r', encoding='utf-8') as file:
    text = file.read()
# Train the tokenizer
tokenizer.train(text, 256 + 5)  # 256 byte tokens, then 5 merges
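# Sanity check (an illustrative addition, not part of the original script):
# after training, the tokenizer's merges dict should hold the 5 learned merges.
print(f"Learned merges: {len(tokenizer.merges)}")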
# Encode the text
encoded_text = tokenizer.encode(text)
print("Encoded:", encoded_text)
# Decode the text
decoded_text = tokenizer.decode(encoded_text)
print("Decoded:", decoded_text)
# Save the trained tokenizer model
tokenizer.save("first") # Writes two files: toy.model (for loading) and toy.vocab (for viewing)