import os
import gradio as gr
import numpy as np
import whisper
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from gtts import gTTS
# Load Whisper STT model
whisper_model = whisper.load_model("base")
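# "base" favors speed and memory; larger checkpoints such as "small" or "medium" transcribe more accurately.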
# Load the SMaLL-100 translation model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("alirezamsh/small100")
model = AutoModelForSeq2SeqLM.from_pretrained("alirezamsh/small100")
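# Note: the alirezamsh/small100 model card recommends its custom SMALL100Tokenizer
# (shipped as tokenization_small100.py in that repo) with tokenizer.tgt_lang set to the
# target language; AutoTokenizer will typically load a generic M2M100-style tokenizer instead.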
def translate_speech(audio, target_lang):
    # Gradio's "numpy" audio type delivers a (sample_rate, data) tuple
    sample_rate, data = audio
    if data.ndim > 1:  # mix stereo down to mono
        data = data.mean(axis=1)
    # Convert int16 PCM to float32 in [-1, 1], as Whisper expects
    data = data.astype("float32") / 32768.0
    # Crude linear resample to Whisper's 16 kHz input rate (a dedicated resampler
    # such as librosa or torchaudio would give better quality)
    if sample_rate != 16000:
        new_len = int(len(data) * 16000 / sample_rate)
        data = np.interp(
            np.linspace(0, len(data), num=new_len, endpoint=False),
            np.arange(len(data)),
            data,
        ).astype("float32")
    # Pad/trim to Whisper's 30-second window and compute the log-Mel spectrogram
    audio = whisper.pad_or_trim(data)
    mel = whisper.log_mel_spectrogram(audio).to(whisper_model.device)
    _, probs = whisper_model.detect_language(mel)  # informational; decode() also auto-detects
    options = whisper.DecodingOptions(fp16=False)
    result = whisper.decode(whisper_model, mel, options)
    text = result.text
    # Translate the transcript. SMaLL-100 expects the *target* language code on the
    # encoder side, so the target language is set via the tokenizer's source-language slot
    # (the model card's custom SMALL100Tokenizer does the same via tokenizer.tgt_lang).
    tokenizer.src_lang = target_lang
    encoded_text = tokenizer(text, return_tensors="pt")
    generated_tokens = model.generate(**encoded_text)
    translated_text = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
    # Text-to-speech (TTS) with gTTS, written to an mp3 file
    tts = gTTS(text=translated_text, lang=target_lang)
    audio_path = "translated_audio.mp3"
    tts.save(audio_path)
    return audio_path
def translate_speech_interface(audio, target_lang):
    # Run the full STT -> MT -> TTS pipeline and return the path to the generated mp3
    translated_audio = translate_speech(audio, target_lang)
    return translated_audio

# Gradio UI (current gr.Audio / gr.Dropdown components; gr.inputs / gr.outputs are deprecated)
audio_recording = gr.Audio(sources=["microphone"], type="numpy", label="Record your speech")
lang_choices = ["ru", "fr", "en", "de"]
lang_dropdown = gr.Dropdown(choices=lang_choices, label="Select language to translate into")
output_audio = gr.Audio(type="filepath", label="Translated Audio")
iface = gr.Interface(
    fn=translate_speech_interface,
    inputs=[audio_recording, lang_dropdown],
    outputs=output_audio,
    title="Speech Translator",
)
iface.launch()
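# On Hugging Face Spaces the app is served automatically; when run locally, launch() defaults to http://127.0.0.1:7860.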