# Source: Hugging Face Space "ml-en-stt-model" (Space status when captured: Sleeping)
# File: IndicTrans2/huggingface_interface/IndicTransToolkit/tokenizer_training/my_tokenizer/preprocessing.py
# Resize the IndicTrans2 Indic-En model's token embeddings to match a custom
# tokenizer, then save the resized model and the tokenizer side by side.
from transformers import AutoModelForSeq2SeqLM, PreTrainedTokenizerFast

# Serialized `tokenizers` JSON file produced by tokenizer training (adjust path).
TOKENIZER_FILE = "IndicTrans2/huggingface_interface/IndicTransToolkit/tokenizer_training/my_tokenizer/tokenizer.json"
# Hub checkpoint whose embeddings are resized.
BASE_MODEL = "ai4bharat/indictrans2-indic-en-1B"
# Directory that receives both the resized model and the tokenizer files.
OUTPUT_DIR = "IndicTrans2/huggingface_interface/IndicTransToolkit/tokenizer_training/my_tokenizer/converted_tokenizer/new_model"

# Load the custom tokenizer straight from its tokenizer.json file.
# `AutoTokenizer.from_pretrained` expects a repo id or a directory, not a path
# to a single JSON file, so the fast-tokenizer wrapper is built from the file.
# NOTE(review): special tokens (pad/eos/unk, ...) are not configured here —
# confirm whether they must be passed explicitly for this tokenizer.
tokenizer = PreTrainedTokenizerFast(tokenizer_file=TOKENIZER_FILE)

# Load the IndicTrans2 model. The checkpoint ships its own modelling code, so
# `trust_remote_code=True` is needed; this executes code downloaded from the
# Hub repository, so only use it with a source you trust.
model = AutoModelForSeq2SeqLM.from_pretrained(
    BASE_MODEL,
    trust_remote_code=True,
    ignore_mismatched_sizes=True,
)

# Resize embeddings to match the new tokenizer's vocabulary size.
# NOTE(review): IndicTrans2 is reported to use separate source/target
# vocabularies — confirm this call resizes the intended embedding table(s).
model.resize_token_embeddings(len(tokenizer))

# Save the updated model and tokenizer into the same directory.
model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)