File size: 500 Bytes
d44849f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
import json
from itertools import islice

# Default path to the tokenizer's vocab.json (token -> id mapping).
DEFAULT_VOCAB_PATH = "/Users/apple/Desktop/indictrans2/IndicTrans2/huggingface_interface/IndicTransToolkit/tokenizer_training/my_tokenizer/vocab.json"


def main(vocab_path: str = DEFAULT_VOCAB_PATH, limit: int = 10) -> None:
    """Load a tokenizer vocab.json and print its size plus sample entries.

    Args:
        vocab_path: Path to the vocab.json file (a token -> id mapping).
        limit: Maximum number of sample entries to print.
    """
    with open(vocab_path, "r", encoding="utf-8") as f:
        vocab = json.load(f)

    # Print vocab size & first few entries
    print(f"Vocab size: {len(vocab)}")
    print("Sample tokens:")
    # islice yields exactly `limit` items; the original post-print break
    # emitted 11 entries instead of the intended 10 (off-by-one).
    for token, idx in islice(vocab.items(), limit):
        print(f"{token}: {idx}")


if __name__ == "__main__":
    main()