File size: 752 Bytes
d44849f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
import sentencepiece as spm
import json

# Single source of truth for the tokenizer directory — the model path and the
# vocab output path previously duplicated this long prefix in two literals.
_TOKENIZER_DIR = (
    "/Users/apple/Desktop/indictrans2/IndicTrans2/huggingface_interface/"
    "IndicTransToolkit/tokenizer_training/my_tokenizer"
)
DEFAULT_MODEL_PATH = _TOKENIZER_DIR + "/custom_tokenizer.model"
DEFAULT_VOCAB_PATH = _TOKENIZER_DIR + "/vocab.json"


def build_vocab(sp):
    """Return a {piece: id} mapping covering every piece in *sp*.

    *sp* is a loaded ``SentencePieceProcessor`` (any object exposing
    ``GetPieceSize()`` and ``IdToPiece(i)`` works).
    """
    return {sp.IdToPiece(i): i for i in range(sp.GetPieceSize())}


def export_vocab(model_path=DEFAULT_MODEL_PATH, vocab_save_path=DEFAULT_VOCAB_PATH):
    """Load a SentencePiece model and dump its vocabulary as vocab.json.

    Args:
        model_path: path to the trained ``.model`` file to load.
        vocab_save_path: destination path for the JSON piece->id mapping.

    Side effects: writes ``vocab_save_path`` and prints progress to stdout.
    """
    # Load the SentencePiece model directly (no HF wrapper needed).
    sp = spm.SentencePieceProcessor()
    sp.Load(model_path)

    vocab = build_vocab(sp)
    print(f"Total vocab size: {len(vocab)}")

    # ensure_ascii=False keeps Indic-script pieces human-readable in the JSON.
    with open(vocab_save_path, 'w', encoding='utf-8') as f:
        json.dump(vocab, f, ensure_ascii=False, indent=4)

    print(f"Vocab file saved at: {vocab_save_path}")


if __name__ == "__main__":
    # Guarded so importing this module no longer triggers filesystem I/O.
    export_vocab()