from datasets import load_dataset
import sentencepiece as spm
import os
import shutil

# 1️⃣ Load dataset
dataset = load_dataset('nitikdias/malayala-english_medical_dataset')

# 2️⃣ Extract texts
ml_texts = dataset['train']['ml']
en_texts = dataset['train']['en']

# 3️⃣ Combine all texts
all_texts = ml_texts + en_texts

# 4️⃣ Save combined corpus to a text file
corpus_file = "tokenizer_corpus.txt"
with open(corpus_file, "w", encoding="utf-8") as f:
    for line in all_texts:
        f.write(line.strip() + "\n")

print(f"Corpus saved at: {corpus_file}")

# 5️⃣ Train SentencePiece tokenizer
spm.SentencePieceTrainer.train(
    input=corpus_file,
    model_prefix='custom_tokenizer',
    vocab_size=2500,
    character_coverage=1.0,
    model_type='bpe'  # 'unigram' can also be used
)

print("Tokenizer training completed.")
print("Files generated:")
print("- custom_tokenizer.model")
print("- custom_tokenizer.vocab")

# 6️⃣ OPTIONAL: Move tokenizer files to a desired local folder
output_dir = "/Users/apple/Desktop/indictrans2/indictrans2/huggingface_interface/indictranstoolkit/tokenizer_training/my_tokenizer/"  # Change to any folder you want

# Create the folder if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

# Move files
shutil.move("custom_tokenizer.model", os.path.join(output_dir, "custom_tokenizer.model"))
shutil.move("custom_tokenizer.vocab", os.path.join(output_dir, "custom_tokenizer.vocab"))

print(f"Tokenizer files saved locally at: {output_dir}")
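
# 7️⃣ OPTIONAL: Sanity-check the trained tokenizer.
# A minimal sketch, assuming the .model file was moved to output_dir above;
# the sample sentence is illustrative only — swap in any line from the corpus.
sp = spm.SentencePieceProcessor(model_file=os.path.join(output_dir, "custom_tokenizer.model"))

sample = "The patient reported mild fever and headache."
pieces = sp.encode(sample, out_type=str)  # subword pieces
ids = sp.encode(sample, out_type=int)     # token IDs

print(f"Pieces: {pieces}")
print(f"IDs: {ids}")
print(f"Round-trip: {sp.decode(ids)}")    # should reproduce the sample text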