import gradio as gr
from faster_whisper import WhisperModel
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast

from utils import lang_ids

# Speech-translation model: Whisper "medium" on CPU with int8 quantization.
model_size = "medium"
ts_model = WhisperModel(model_size, device="cpu", compute_type="int8")

# Display names for the target-language dropdown come from utils.lang_ids.
lang_list = list(lang_ids.keys())
def translate_audio(inputs, target_language):
    if inputs is None:
        raise gr.Error("No audio file submitted! Please upload an audio file before submitting your request.")

    # Whisper's "translate" task always outputs English text, regardless of the input language.
    segments, _ = ts_model.transcribe(inputs, task="translate")
    target_lang = lang_ids[target_language]

    if target_language == 'English':
        # English is already Whisper's output language, so just concatenate the segments.
        english_text = ''
        for segment in segments:
            english_text = english_text + segment.text
        return english_text
    else:
        # For any other target, translate Whisper's English output with mBART-50 (many-to-many).
        # Note: the model and tokenizer are reloaded on every request.
        model = MBartForConditionalGeneration.from_pretrained("sanjitaa/mbart-many-to-many")
        tokenizer = MBart50TokenizerFast.from_pretrained("sanjitaa/mbart-many-to-many")
        tokenizer.src_lang = "en_XX"

        translated_text = ''
        for segment in segments:
            encoded_chunk = tokenizer(segment.text, return_tensors="pt")
            generated_tokens = model.generate(
                **encoded_chunk,
                forced_bos_token_id=tokenizer.lang_code_to_id[target_lang],
            )
            translated_chunk = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
            translated_text = translated_text + translated_chunk[0]
        return translated_text
translation_interface = gr.Interface(
    fn=translate_audio,
    inputs=[
        gr.Audio(source="upload", type="filepath", label="Audio file"),
        gr.Dropdown(lang_list, value="English", label="Target Language"),
    ],
    outputs="text",
    layout="horizontal",
    theme="huggingface",
    title="Translate Audio to English",
    description=(
        "Translate audio inputs to English with Whisper (via faster-whisper); "
        "other target languages are produced by translating Whisper's English output with mBART-50."
    ),
    allow_flagging="never",
)

if __name__ == "__main__":
    translation_interface.launch()
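
# --- Hypothetical sketch (not part of this file): the only thing the app needs from
# utils.py is a `lang_ids` dict mapping the dropdown's display names to mBART-50
# language codes (i.e. keys of `tokenizer.lang_code_to_id`). The actual mapping in
# this Space may differ; a minimal version might look like:
#
#   lang_ids = {
#       "English": "en_XX",
#       "French": "fr_XX",
#       "Spanish": "es_XX",
#       "Hindi": "hi_IN",
#   }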