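# Speech-to-speech translation demo: Whisper (speech-to-text) -> SMaLL-100
# (translation) -> gTTS (text-to-speech), wrapped in a Gradio interface.
# Assumed dependencies (e.g. in the Space's requirements.txt): gradio,
# openai-whisper, transformers, sentencepiece, torch, gTTS.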
import gradio as gr
import whisper
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from gtts import gTTS
# Load Whisper STT model
whisper_model = whisper.load_model("base")
# Load the SMaLL-100 translation model and tokenizer
# (the model card suggests its custom SMALL100Tokenizer in place of AutoTokenizer)
tokenizer = AutoTokenizer.from_pretrained("alirezamsh/small100")
model = AutoModelForSeq2SeqLM.from_pretrained("alirezamsh/small100")
def translate_speech(audio_path, target_lang):
    # Gradio passes the recording as a filepath (see the Audio input below),
    # so it can be handed to Whisper directly.
    audio = whisper.load_audio(audio_path)
    audio = whisper.pad_or_trim(audio)
    mel = whisper.log_mel_spectrogram(audio).to(whisper_model.device)

    # Detect the spoken language (informational; SMaLL-100 translates from any
    # supported source language without being told which one)
    _, probs = whisper_model.detect_language(mel)
    detected_lang = max(probs, key=probs.get)
    print(f"Detected language: {detected_lang}")

    # Decode the audio into text (fp16=False avoids the FP16 warning on CPU)
    options = whisper.DecodingOptions(fp16=False)
    result = whisper.decode(whisper_model, mel, options)
    text = result.text

    # Translate the transcription. Per the SMaLL-100 model card, the target
    # language is selected on the tokenizer before encoding the source text.
    tokenizer.tgt_lang = target_lang
    encoded_text = tokenizer(text, return_tensors="pt")
    generated_tokens = model.generate(**encoded_text)
    translated_text = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]

    # Text-to-speech (TTS): synthesize the translation and return the file path
    tts = gTTS(text=translated_text, lang=target_lang)
    output_path = "translated_audio.mp3"
    tts.save(output_path)
    return output_path
def translate_speech_interface(audio, target_lang):
    # The output component below expects a filepath, so the path returned
    # by translate_speech can be passed straight through.
    return translate_speech(audio, target_lang)
# Define the Gradio interface (Gradio 4.x API: gr.inputs/gr.outputs no longer exist)
audio_recording = gr.Audio(sources=["microphone"], type="filepath", label="Record your speech")
target_language = gr.Dropdown(["en", "ru", "fr"], value="en", label="Target Language")
output_audio = gr.Audio(type="filepath", label="Translated Audio")
gr.Interface(fn=translate_speech_interface, inputs=[audio_recording, target_language], outputs=output_audio, title="Speech Translator").launch()
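# To try the app locally: `python app.py` serves the interface on
# http://127.0.0.1:7860 by default (Gradio's default port).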