from minbpe import RegexTokenizer

# Initialize the tokenizer
tokenizer = RegexTokenizer()

# Read text from a file
file_path = "/Users/mohammad.ibrahim/Desktop/TSAI/combined_text.txt"
with open(file_path, 'r', encoding='utf-8') as file:
    text = file.read()

# Train the tokenizer: 256 byte tokens plus 5 merges (vocab size 261)
tokenizer.train(text, 256 + 5)

# Encode the text
encoded_text = tokenizer.encode(text)
print("Encoded:", encoded_text)

# Decode the text
decoded_text = tokenizer.decode(encoded_text)
print("Decoded:", decoded_text)

# Save the trained tokenizer model
tokenizer.save("first")  # writes two files: first.model (for loading) and first.vocab (for viewing)
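As a sanity check, the saved model can be loaded back into a fresh tokenizer and used for an encode/decode round-trip. This is a minimal sketch assuming minbpe's load(model_file) method, which reads the .model file written by save():

# Load the saved model into a new tokenizer instance
loaded = RegexTokenizer()
loaded.load("first.model")  # reads the .model file produced by tokenizer.save("first")

# Encoding then decoding should reproduce the original text exactly
assert loaded.decode(loaded.encode(text)) == text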