Spaces:
Build error
Build error
File size: 2,144 Bytes
36bec1c 4adb977 ec72da9 bbee8bf bc7920f bbee8bf 0ce7006 b3ba25a bbee8bf a96b473 c3f4b33 a96b473 bbee8bf ec72da9 a66dfeb bbee8bf ec72da9 bbee8bf bc7920f bbee8bf fb0bdd0 bbee8bf 53c1f5b bbee8bf 09b2c6d 45b71c9 bbee8bf 45b71c9 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 |
import gradio as gr
from gradio import components
import whisper
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from gtts import gTTS
import soundfile as sf
import os
import numpy as np
def translate_speech_to_speech(input_audio):
    """Translate a recorded utterance into spoken Russian.

    The recording is transcribed with Whisper (which also detects the spoken
    language), the transcript is translated with the ``alirezamsh/small100``
    model, and the translation is synthesised with gTTS.

    Args:
        input_audio: ``(sample_rate, samples)`` tuple as delivered by a Gradio
            ``Audio`` input of type ``"numpy"``, or ``None`` when nothing was
            recorded.

    Returns:
        A ``(sample_rate, samples)`` tuple with the synthesised speech as
        int16 samples, or ``None`` when ``input_audio`` is ``None``.
    """
    import tempfile  # local import: only this function needs it

    if input_audio is None:
        # Nothing was recorded, so there is nothing to translate.
        return None

    # Gradio's numpy audio format puts the sample rate first.
    sample_rate, samples = input_audio

    to_lang = "ru"

    # A private scratch directory keeps simultaneous requests from overwriting
    # each other's files and removes them once the request is done.
    with tempfile.TemporaryDirectory() as work_dir:
        input_file = os.path.join(work_dir, "input_audio.wav")
        sf.write(input_file, samples, sample_rate)

        # --- Speech recognition and language detection -------------------
        # NOTE: the models are loaded on every call, which is slow.
        speech_model = whisper.load_model("base")
        audio = whisper.load_audio(input_file)
        # pad_or_trim fits the clip to Whisper's fixed input window.
        audio = whisper.pad_or_trim(audio)
        mel = whisper.log_mel_spectrogram(audio).to(speech_model.device)
        _, probs = speech_model.detect_language(mel)
        lang = max(probs, key=probs.get)
        # Only request half precision on CUDA; fall back to fp32 elsewhere.
        options = whisper.DecodingOptions(
            fp16=speech_model.device.type == "cuda"
        )
        text = whisper.decode(speech_model, mel, options).text

        # --- Translation --------------------------------------------------
        tokenizer = AutoTokenizer.from_pretrained("alirezamsh/small100")
        translator = AutoModelForSeq2SeqLM.from_pretrained("alirezamsh/small100")
        tokenizer.src_lang = lang
        # The target language must be given to the tokenizer, otherwise the
        # model is never told which language to produce.
        # NOTE(review): the small100 model card sets ``tgt_lang`` on its own
        # tokenizer class -- confirm AutoTokenizer resolves to that class.
        tokenizer.tgt_lang = to_lang
        encoded = tokenizer(text, return_tensors="pt")
        generated_tokens = translator.generate(**encoded)
        translated_text = tokenizer.batch_decode(
            generated_tokens, skip_special_tokens=True
        )[0]

        # --- Text-to-speech -----------------------------------------------
        # gTTS writes MP3 data, so the file carries the matching extension.
        # NOTE(review): reading MP3 through soundfile needs a libsndfile
        # build with MP3 support -- confirm on the deployment image.
        output_file = os.path.join(work_dir, "translated_speech.mp3")
        gTTS(text=translated_text, lang=to_lang).save(output_file)
        translated_audio, output_rate = sf.read(output_file, dtype="int16")

    # A numpy audio output needs the sample rate alongside the samples.
    return output_rate, translated_audio
# Gradio UI wiring: one microphone input, one audio output, both exchanged
# with the handler as numpy (sample_rate, samples) tuples.
title = "Speech-to-Speech Translator"
input_audio = gr.inputs.Audio(source="microphone", type="numpy")
output_audio = gr.outputs.Audio(type="numpy")
stt_demo = gr.Interface(
    fn=translate_speech_to_speech,
    inputs=input_audio,
    outputs=output_audio,
    title=title,
    description="Speak in any language, and the translator will convert it to speech in the target language.",
)

# Start the web server only when the file is run as a script.
if __name__ == "__main__":
    stt_demo.launch()