Hugging Face Space: stt-model-ml-en (status: Sleeping)
File: IndicTrans2/huggingface_interface/IndicTransToolkit/tokenizer_training/train_tokenizer.py
"""Train a SentencePiece BPE tokenizer on a Malayalam-English medical corpus.

Downloads the parallel dataset from the Hugging Face Hub, flattens both
language columns into a single plain-text corpus file, trains a 2500-token
BPE model with SentencePiece, and moves the resulting model/vocab files
into a local output folder.

Side effects: network download, writes `tokenizer_corpus.txt` and the
`custom_tokenizer.{model,vocab}` files in the working directory, then
moves the tokenizer files into `output_dir`.
"""
from datasets import load_dataset
import sentencepiece as spm
import os
import shutil

# 1) Load the parallel Malayalam/English medical dataset from the Hub.
dataset = load_dataset('nitikdias/malayala-english_medical_dataset')

# 2) Extract both language columns from the train split.
ml_texts = dataset['train']['ml']
en_texts = dataset['train']['en']

# 3) Combine all texts into one list — the tokenizer is shared across
# both languages, so it must see both scripts during training.
all_texts = ml_texts + en_texts

# 4) Save the combined corpus to a plain-text file, one sentence per line.
corpus_file = "tokenizer_corpus.txt"
with open(corpus_file, "w", encoding="utf-8") as f:
    for line in all_texts:
        text = line.strip()
        # Fix: the original wrote a bare "\n" for empty/whitespace-only
        # entries, padding the training corpus with blank lines.
        if text:
            f.write(text + "\n")
print(f"Corpus saved at: {corpus_file}")

# 5) Train the SentencePiece tokenizer on the combined corpus.
spm.SentencePieceTrainer.train(
    input=corpus_file,
    model_prefix='custom_tokenizer',
    vocab_size=2500,
    character_coverage=1.0,  # full coverage — needed for the Malayalam script
    model_type='bpe'  # 'unigram' can also be used
)
print("Tokenizer training completed.")
print("Files generated:")
print("- custom_tokenizer.model")
print("- custom_tokenizer.vocab")

# 6) OPTIONAL: Move tokenizer files to a desired local folder
output_dir = "/Users/apple/Desktop/indictrans2/indictrans2/huggingface_interface/indictranstoolkit/tokenizer_training/my_tokenizer/"  # Change to any folder you want

# Create folder if it doesn't exist.
os.makedirs(output_dir, exist_ok=True)

# Move files into the output folder. NOTE(review): shutil.move onto an
# existing destination file is platform-dependent (overwrites on POSIX,
# may raise on Windows) — confirm the target folder starts clean.
shutil.move("custom_tokenizer.model", os.path.join(output_dir, "custom_tokenizer.model"))
shutil.move("custom_tokenizer.vocab", os.path.join(output_dir, "custom_tokenizer.vocab"))
print(f"Tokenizer files saved locally at: {output_dir}")