import os
import gradio as gr
import whisper
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from gtts import gTTS
# Load Whisper STT model
whisper_model = whisper.load_model("base")
# Load the SMALL100 translation model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("alirezamsh/small100")
model = AutoModelForSeq2SeqLM.from_pretrained("alirezamsh/small100")

def translate_speech(audio, target_lang):
    # Speech-to-text: Gradio passes the recording as a file path; Whisper loads it at 16 kHz
    audio = whisper.load_audio(audio)
    audio = whisper.pad_or_trim(audio)
    mel = whisper.log_mel_spectrogram(audio).to(whisper_model.device)
    _, probs = whisper_model.detect_language(mel)  # detected language is informational only
    options = whisper.DecodingOptions(fp16=False)
    result = whisper.decode(whisper_model, mel, options)
    text = result.text
    # Translate the transcribed text; SMALL100 only needs the target language code on the tokenizer
    tokenizer.tgt_lang = target_lang
    encoded_text = tokenizer(text, return_tensors="pt")
    generated_tokens = model.generate(**encoded_text)
    translated_text = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
    # Text-to-speech (TTS) on the translated text
    tts = gTTS(text=translated_text, lang=target_lang)
    audio_path = "translated_audio.mp3"
    tts.save(audio_path)
    return audio_path

def translate_speech_interface(audio, target_lang):
    # Return the path of the translated MP3; the output gr.Audio component reads the file from disk
    translated_audio = translate_speech(audio, target_lang)
    return translated_audio

# Build the Gradio UI (Gradio 4.x component API; gr.inputs/gr.outputs were removed)
audio_recording = gr.Audio(sources=["microphone"], type="filepath", label="Record your speech")
lang_choices = ["ru", "fr", "en", "de"]
lang_dropdown = gr.Dropdown(choices=lang_choices, label="Select language to translate into")
output_audio = gr.Audio(type="filepath", label="Translated Audio")
iface = gr.Interface(fn=translate_speech_interface, inputs=[audio_recording, lang_dropdown],
                     outputs=output_audio, title="Speech Translator")
iface.launch()
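
# Local usage sketch (assumed, unpinned dependencies):
#   pip install gradio openai-whisper transformers sentencepiece gtts torch
#   python app.py
# Note: whisper.load_audio shells out to ffmpeg, so ffmpeg must be on the system PATH.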