# NOTE: hosting-page header residue ("Spaces: Sleeping") — not part of the script.
# Export the vocabulary of a trained SentencePiece model to a JSON file that
# maps each piece (token string) to its integer id.
import json

import sentencepiece as spm

# Directory holding the trained tokenizer model; vocab.json is written next to it.
TOKENIZER_DIR = "/Users/apple/Desktop/indictrans2/IndicTrans2/huggingface_interface/IndicTransToolkit/tokenizer_training/my_tokenizer"

# Load SentencePiece model directly
sp = spm.SentencePieceProcessor()
sp.Load(f"{TOKENIZER_DIR}/custom_tokenizer.model")

# Extract vocab: walk every id the model defines and record piece -> id.
# If two ids ever shared the same piece string, the later id would win.
vocab = {sp.IdToPiece(i): i for i in range(sp.GetPieceSize())}
print(f"Total vocab size: {len(vocab)}")

# Save vocab as vocab.json
vocab_save_path = f"{TOKENIZER_DIR}/vocab.json"
with open(vocab_save_path, 'w', encoding='utf-8') as f:
    # ensure_ascii=False writes non-ASCII pieces as-is instead of \uXXXX escapes.
    json.dump(vocab, f, ensure_ascii=False, indent=4)
print(f"Vocab file saved at: {vocab_save_path}")