import os
import shutil

from datasets import load_dataset
import sentencepiece as spm
# 1️⃣ Load dataset
dataset = load_dataset('nitikdias/malayala-english_medical_dataset')
# 2️⃣ Extract texts
ml_texts = dataset['train']['ml']
en_texts = dataset['train']['en']
# 3️⃣ Combine all texts
all_texts = ml_texts + en_texts
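# Optional sanity check (a minimal sketch): confirm both splits were read
# and see how large the combined corpus is before training.
print(f"Malayalam lines: {len(ml_texts)}, English lines: {len(en_texts)}")
print(f"Combined corpus: {len(all_texts)} lines")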
# 4️⃣ Save combined corpus to text file
corpus_file = "tokenizer_corpus.txt"
with open(corpus_file, "w", encoding="utf-8") as f:
    for line in all_texts:
        f.write(line.strip() + "\n")
print(f"Corpus saved at: {corpus_file}")
# 5️⃣ Train SentencePiece tokenizer
spm.SentencePieceTrainer.train(
    input=corpus_file,
    model_prefix='custom_tokenizer',
    vocab_size=2500,
    character_coverage=1.0,
    model_type='bpe'  # 'unigram' can also be used
)
print("Tokenizer training completed.")
print("Files generated:")
print("- custom_tokenizer.model")
print("- custom_tokenizer.vocab")
# 6️⃣ OPTIONAL: Move tokenizer files to a desired local folder
output_dir = "/Users/apple/Desktop/indictrans2/indictrans2/huggingface_interface/indictranstoolkit/tokenizer_training/my_tokenizer/" # Change to any folder you want
# Create the folder if it doesn't exist
os.makedirs(output_dir, exist_ok=True)
# Move files
shutil.move("custom_tokenizer.model", os.path.join(output_dir, "custom_tokenizer.model"))
shutil.move("custom_tokenizer.vocab", os.path.join(output_dir, "custom_tokenizer.vocab"))
print(f"Tokenizer files saved locally at: {output_dir}")