viditk's picture
Upload 134 files
d44849f verified
raw
history blame contribute delete
805 Bytes
"""Adapt the IndicTrans2 model to a custom-trained tokenizer.

Loads a locally trained fast tokenizer from its raw ``tokenizer.json``,
resizes the IndicTrans2 model's token-embedding matrix to the new
vocabulary size, and saves both model and tokenizer into one output
directory so they can be reloaded together with ``from_pretrained``.
"""
from transformers import AutoModelForSeq2SeqLM, PreTrainedTokenizerFast

# Raw tokenizers-library JSON produced by the tokenizer-training step.
TOKENIZER_FILE = "IndicTrans2/huggingface_interface/IndicTransToolkit/tokenizer_training/my_tokenizer/tokenizer.json"
# Single output directory shared by the resized model and the tokenizer.
OUTPUT_DIR = "IndicTrans2/huggingface_interface/IndicTransToolkit/tokenizer_training/my_tokenizer/converted_tokenizer/new_model"

# BUG FIX: AutoTokenizer.from_pretrained expects a directory or Hub repo id,
# not a bare tokenizer.json path. A raw tokenizers file is loaded by wrapping
# it with PreTrainedTokenizerFast via the tokenizer_file argument.
tokenizer = PreTrainedTokenizerFast(tokenizer_file=TOKENIZER_FILE)

# BUG FIX: IndicTrans2 checkpoints ship custom modeling code on the Hub, so
# trust_remote_code=True is required or loading raises. ignore_mismatched_sizes
# tolerates the vocabulary-size difference until we resize below.
model = AutoModelForSeq2SeqLM.from_pretrained(
    "ai4bharat/indictrans2-indic-en-1B",
    trust_remote_code=True,
    ignore_mismatched_sizes=True,
)

# Grow/shrink the input (and tied output) embedding matrix to match the new
# tokenizer's vocabulary.
model.resize_token_embeddings(len(tokenizer))

# Persist the updated model and tokenizer side by side in OUTPUT_DIR.
model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)