import gradio as gr
from faster_whisper import WhisperModel
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast

from utils import lang_ids

model_size = "medium"
ts_model = WhisperModel(model_size, device="cpu", compute_type="int8")

lang_list = list(lang_ids.keys())

# The mBART model is loaded lazily on the first non-English request and then
# cached, so English-only sessions never pay its load cost and repeat requests
# don't reload it from disk.
mbart_model = None
mbart_tokenizer = None


def load_mbart():
    global mbart_model, mbart_tokenizer
    if mbart_model is None:
        mbart_model = MBartForConditionalGeneration.from_pretrained("sanjitaa/mbart-many-to-many")
        mbart_tokenizer = MBart50TokenizerFast.from_pretrained("sanjitaa/mbart-many-to-many")
        # All Whisper output is English, so the source language is fixed.
        mbart_tokenizer.src_lang = "en_XX"
    return mbart_model, mbart_tokenizer


def translate_audio(inputs, target_language):
    if inputs is None:
        raise gr.Error("No audio file submitted! Please upload an audio file before submitting your request.")

    # Whisper's "translate" task transcribes the audio directly into English.
    segments, _ = ts_model.transcribe(inputs, task="translate")
    target_lang = lang_ids[target_language]

    if target_language == "English":
        # Segment texts carry their own leading spaces, so plain concatenation works.
        return "".join(segment.text for segment in segments)

    # For any other target, translate the English segments with mBART-50,
    # forcing the decoder to start in the requested language.
    model, tokenizer = load_mbart()
    translated_chunks = []
    for segment in segments:
        encoded_chunk = tokenizer(segment.text, return_tensors="pt")
        generated_tokens = model.generate(
            **encoded_chunk,
            forced_bos_token_id=tokenizer.lang_code_to_id[target_lang],
        )
        translated_chunks.append(
            tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
        )
    return " ".join(translated_chunks)


translation_interface = gr.Interface(
    fn=translate_audio,
    inputs=[
        gr.Audio(source="upload", type="filepath", label="Audio file"),
        gr.Dropdown(lang_list, value="English", label="Target Language"),
    ],
    outputs="text",
    theme="huggingface",
    title="Translate Audio",
    description=(
        "Translate an uploaded audio file: Whisper (medium) produces an English "
        "translation, and mBART-50 renders it in the selected target language."
    ),
    allow_flagging="never",
)

if __name__ == "__main__":
    translation_interface.launch()
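
# Quick smoke test, bypassing the Gradio UI. A minimal sketch, assuming this
# file is saved as app.py and that "sample.wav" is a local audio file (both
# are assumptions, not part of this repo); calling the handler directly is a
# convenient way to exercise both code paths before launching the interface:
#
#     from app import translate_audio
#     print(translate_audio("sample.wav", "English"))  # Whisper-only path
#     print(translate_audio("sample.wav", "French"))   # Whisper + mBART path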