File size: 805 Bytes
d44849f
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, PreTrainedTokenizerFast

# Script: pair a custom-trained tokenizer with the IndicTrans2 Indic->En 1B
# checkpoint, resize the model's token embeddings to the tokenizer's
# vocabulary size, and save both to one output directory.

# Locations (adjust paths). The output directory sits under the tokenizer
# directory, so all paths are derived from one root.
TOKENIZER_DIR = "IndicTrans2/huggingface_interface/IndicTransToolkit/tokenizer_training/my_tokenizer"
TOKENIZER_FILE = f"{TOKENIZER_DIR}/tokenizer.json"
OUTPUT_DIR = f"{TOKENIZER_DIR}/converted_tokenizer/new_model"
BASE_MODEL = "ai4bharat/indictrans2-indic-en-1B"

# Load the custom tokenizer. `AutoTokenizer.from_pretrained` expects a repo id
# or a directory, not a single `tokenizer.json`; a raw Tokenizers file is
# wrapped with `PreTrainedTokenizerFast(tokenizer_file=...)` instead.
# NOTE(review): a tokenizer built this way has no pad/eos/unk special tokens
# registered on the wrapper - confirm whether downstream code needs them set.
tokenizer = PreTrainedTokenizerFast(tokenizer_file=TOKENIZER_FILE)

# Load the IndicTrans2 model. The checkpoint ships its own modelling code, so
# `trust_remote_code=True` is needed for a non-interactive load.
# SECURITY: this runs Python code downloaded from the model repository; only
# keep it for repositories you trust.
model = AutoModelForSeq2SeqLM.from_pretrained(
    BASE_MODEL,
    trust_remote_code=True,
    ignore_mismatched_sizes=True,
)

# Resize embeddings to match the new tokenizer's vocabulary size.
# NOTE(review): IndicTrans2 presumably keeps separate source/target
# vocabularies - confirm that the remote model class resizes the embeddings
# you intend, and that the model is fine-tuned afterwards.
model.resize_token_embeddings(len(tokenizer))

# Save the updated model and tokenizer side by side so the output directory
# can be loaded as one checkpoint.
model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)