# Spaces: Runtime error
import sys

from minbpe import RegexTokenizer

# Train a small regex-based BPE tokenizer on a text corpus, round-trip the
# corpus through encode/decode as a sanity check, and save the trained model.

# Default training corpus. This is a machine-specific absolute path, so it can
# be overridden by passing a different path as the first command-line argument.
DEFAULT_CORPUS_PATH = "/Users/mohammad.ibrahim/Desktop/TSAI/combined_text.txt"

NUM_BYTE_TOKENS = 256  # the base vocabulary: one token per byte value
NUM_MERGES = 5         # merges learned on top of the byte tokens
MODEL_PREFIX = "first"  # file-name prefix used when saving the tokenizer

# Initialize the tokenizer
tokenizer = RegexTokenizer()

# Read text from a file
file_path = sys.argv[1] if len(sys.argv) > 1 else DEFAULT_CORPUS_PATH
with open(file_path, 'r', encoding='utf-8') as file:
    text = file.read()

# Train the tokenizer: target vocabulary size = byte tokens + learned merges
tokenizer.train(text, NUM_BYTE_TOKENS + NUM_MERGES)

# Encode the text
encoded_text = tokenizer.encode(text)
print("Encoded:", encoded_text)

# Decode the text (expected to reproduce the original input)
decoded_text = tokenizer.decode(encoded_text)
print("Decoded:", decoded_text)

# Save the trained tokenizer model.
# Writes two files: first.model (for loading) and first.vocab (for viewing)
tokenizer.save(MODEL_PREFIX)