"""Export the vocabulary of a trained SentencePiece model to a vocab.json file."""

import json

# Hard-coded paths preserved from the original script; override via main()'s
# parameters when reusing this module.
MODEL_PATH = "/Users/apple/Desktop/indictrans2/IndicTrans2/huggingface_interface/IndicTransToolkit/tokenizer_training/my_tokenizer/custom_tokenizer.model"
VOCAB_SAVE_PATH = "/Users/apple/Desktop/indictrans2/IndicTrans2/huggingface_interface/IndicTransToolkit/tokenizer_training/my_tokenizer/vocab.json"


def extract_vocab(sp):
    """Return a {piece: id} mapping for every piece in the processor *sp*.

    *sp* only needs to provide ``GetPieceSize()`` and ``IdToPiece(i)``,
    matching the SentencePieceProcessor API.
    """
    return {sp.IdToPiece(i): i for i in range(sp.GetPieceSize())}


def main(model_path=MODEL_PATH, vocab_save_path=VOCAB_SAVE_PATH):
    """Load a SentencePiece model, extract its vocab, and save it as JSON.

    Args:
        model_path: Path to the trained ``.model`` file.
        vocab_save_path: Destination path for the ``vocab.json`` output.
    """
    # Imported locally so this module can be imported (e.g. to reuse
    # extract_vocab) on machines without sentencepiece installed.
    import sentencepiece as spm

    sp = spm.SentencePieceProcessor()
    sp.Load(model_path)

    vocab = extract_vocab(sp)
    print(f"Total vocab size: {len(vocab)}")

    # ensure_ascii=False keeps non-ASCII (e.g. Indic-script) pieces readable
    # in the JSON output instead of \uXXXX escapes.
    with open(vocab_save_path, "w", encoding="utf-8") as f:
        json.dump(vocab, f, ensure_ascii=False, indent=4)
    print(f"Vocab file saved at: {vocab_save_path}")


if __name__ == "__main__":
    main()