viditk's picture
Upload 134 files
d44849f verified
raw
history blame contribute delete
500 Bytes
import json
from itertools import islice

# Path to the tokenizer's vocab.json (adjust for your environment).
VOCAB_PATH = "/Users/apple/Desktop/indictrans2/IndicTrans2/huggingface_interface/IndicTransToolkit/tokenizer_training/my_tokenizer/vocab.json"


def load_vocab(path: str) -> dict:
    """Load a token -> id mapping from a vocab.json file.

    Raises FileNotFoundError / json.JSONDecodeError if the file is
    missing or malformed.
    """
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)


def main(path: str = VOCAB_PATH) -> None:
    """Print the vocab size and the first 10 (token, id) entries."""
    vocab = load_vocab(path)
    print(f"Vocab size: {len(vocab)}")
    print("Sample tokens:")
    # islice caps the sample at exactly 10 entries; the original
    # `if i >= 10: break` after the print emitted 11.
    for token, idx in islice(vocab.items(), 10):
        print(f"{token}: {idx}")


if __name__ == "__main__":
    main()