from datasets import load_dataset
import sentencepiece as spm
import os
import shutil
# 1️⃣ Load dataset
dataset = load_dataset('nitikdias/malayala-english_medical_dataset')
# 2️⃣ Extract texts
ml_texts = dataset['train']['ml']
en_texts = dataset['train']['en']
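# Optional sanity check (assumes the dataset exposes parallel 'ml' and 'en'
# columns, as used above): confirm both sides have the same number of sentences
print(f"Malayalam sentences: {len(ml_texts)}, English sentences: {len(en_texts)}")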
# 3️⃣ Combine all texts
all_texts = ml_texts + en_texts
# 4️⃣ Save combined corpus to text file
corpus_file = "tokenizer_corpus.txt"
with open(corpus_file, "w", encoding="utf-8") as f:
    for line in all_texts:
        f.write(line.strip() + "\n")
print(f"Corpus saved at: {corpus_file}")
# 5️⃣ Train SentencePiece tokenizer
spm.SentencePieceTrainer.train(
    input=corpus_file,
    model_prefix='custom_tokenizer',
    vocab_size=2500,
    character_coverage=1.0,
    model_type='bpe'  # 'unigram' can also be used
)
print("Tokenizer training completed.")
print("Files generated:")
print("- custom_tokenizer.model")
print("- custom_tokenizer.vocab")
# 6️⃣ OPTIONAL: Move tokenizer files to a desired local folder
output_dir = "/Users/apple/Desktop/indictrans2/indictrans2/huggingface_interface/indictranstoolkit/tokenizer_training/my_tokenizer/" # Change to any folder you want
# Create the folder if it doesn't exist
os.makedirs(output_dir, exist_ok=True)
# Move files
shutil.move("custom_tokenizer.model", os.path.join(output_dir, "custom_tokenizer.model"))
shutil.move("custom_tokenizer.vocab", os.path.join(output_dir, "custom_tokenizer.vocab"))
print(f"Tokenizer files saved locally at: {output_dir}")