Spaces:
Build error
Build error
File size: 2,137 Bytes
36bec1c 4adb977 ec72da9 bbee8bf bc7920f bbee8bf 0ce7006 b3ba25a bbee8bf 93f702e 15e6fbb cffc29a c3f4b33 cffc29a bbee8bf ec72da9 a66dfeb bbee8bf ec72da9 bbee8bf bc7920f bbee8bf fb0bdd0 bbee8bf 53c1f5b bbee8bf 09b2c6d 45b71c9 bbee8bf 45b71c9 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 |
import functools
import os
import tempfile

import gradio as gr
import numpy as np
import soundfile as sf
import whisper
from gradio import components
from gtts import gTTS
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
@functools.lru_cache(maxsize=1)
def _load_whisper_model():
    """Load the Whisper "base" checkpoint once and reuse it for later calls."""
    return whisper.load_model("base")


@functools.lru_cache(maxsize=1)
def _load_translation_model():
    """Load the SMALL-100 seq2seq weights once and reuse them for later calls."""
    return AutoModelForSeq2SeqLM.from_pretrained("alirezamsh/small100")


def _prepare_waveform(input_audio):
    """Validate the recording and return ``(sample_rate, mono_samples)``.

    Accepts either a ``(sample_rate, samples)`` tuple or a bare sample array;
    a bare array is assumed to be sampled at 16 kHz, the rate the original
    implementation hard-coded.

    Raises:
        ValueError: if no audio was supplied or the recording has no samples.
    """
    if input_audio is None:
        raise ValueError("No audio was provided; please record something first.")

    if isinstance(input_audio, tuple):
        sample_rate, samples = input_audio
    else:
        sample_rate, samples = 16000, input_audio

    samples = np.asarray(samples)
    if samples.size == 0:
        raise ValueError("The recording is empty; please record something first.")

    if np.issubdtype(samples.dtype, np.integer):
        # Scale integer PCM into [-1, 1) *before* averaging channels, otherwise
        # the float result of the down-mix is clipped when written to disk.
        # Assumes signed PCM (e.g. int16) -- TODO confirm for other dtypes.
        full_scale = float(np.iinfo(samples.dtype).max) + 1.0
        samples = samples.astype(np.float32) / full_scale

    if samples.ndim == 2:
        # Down-mix (frames, channels) to mono.
        samples = samples.mean(axis=1)

    return int(sample_rate), samples


def translate_speech_to_speech(input_audio):
    """Transcribe a recording, translate it to Russian and synthesise speech.

    Pipeline: Whisper detects the spoken language and transcribes the audio,
    SMALL-100 translates the transcript, and gTTS turns the translation into
    speech.

    Args:
        input_audio: A ``(sample_rate, samples)`` tuple, which is what a
            Gradio numpy audio input delivers, or a bare sample array
            (treated as 16 kHz). ``samples`` may be mono ``(frames,)`` or
            multi-channel ``(frames, channels)``.

    Returns:
        A ``(sample_rate, samples)`` tuple with int16 samples, the shape a
        Gradio numpy audio output expects.

    Raises:
        ValueError: if no audio was supplied, the recording is empty, or no
            speech could be transcribed from it.
    """
    sample_rate, samples = _prepare_waveform(input_audio)

    # A per-call temporary directory keeps concurrent requests from
    # overwriting each other's files and cleans up afterwards.
    with tempfile.TemporaryDirectory() as workdir:
        input_file = os.path.join(workdir, "input_audio.wav")
        # Write at the real capture rate; whisper.load_audio resamples.
        sf.write(input_file, samples, sample_rate)

        # Language detection and transcription.
        speech_model = _load_whisper_model()
        audio = whisper.pad_or_trim(whisper.load_audio(input_file))
        mel = whisper.log_mel_spectrogram(audio).to(speech_model.device)
        _, probs = speech_model.detect_language(mel)
        lang = max(probs, key=probs.get)
        options = whisper.DecodingOptions(
            language=lang,  # reuse the detected language instead of re-detecting
            fp16=speech_model.device.type == "cuda",  # half precision only on GPU
        )
        text = whisper.decode(speech_model, mel, options).text
        if not text.strip():
            raise ValueError("No speech could be recognised in the recording.")

        # Translation.
        to_lang = "ru"
        # The tokenizer carries per-request language state, so it is created
        # per call; only the heavy model weights are cached.
        tokenizer = AutoTokenizer.from_pretrained("alirezamsh/small100")
        translator = _load_translation_model()
        tokenizer.src_lang = lang
        # NOTE(review): the SMALL-100 model card selects the output language
        # via ``tokenizer.tgt_lang``; confirm AutoTokenizer resolves to a
        # tokenizer class that honours it.
        tokenizer.tgt_lang = to_lang
        encoded = tokenizer(text, return_tensors="pt")
        generated_tokens = translator.generate(**encoded)
        translated_text = tokenizer.batch_decode(
            generated_tokens, skip_special_tokens=True
        )[0]

        # Text-to-speech. gTTS writes MP3 data, so name the file accordingly.
        output_file = os.path.join(workdir, "translated_speech.mp3")
        gTTS(text=translated_text, lang=to_lang).save(output_file)

        # Keep the sample rate: the audio output needs it alongside the samples.
        translated_audio, output_rate = sf.read(output_file, dtype="int16")

    return output_rate, translated_audio
# --- Gradio user interface -------------------------------------------------
# Microphone recording in, synthesised translation out.
# NOTE(review): ``gr.inputs`` / ``gr.outputs`` are legacy namespaces; confirm
# they exist in the Gradio version this app is deployed with.
title = "Speech-to-Speech Translator"
input_audio = gr.inputs.Audio(source="microphone")
output_audio = gr.outputs.Audio(type="numpy")

# The handler, its input and its output are passed positionally
# (fn, inputs, outputs); the presentation options stay as keywords.
stt_demo = gr.Interface(
    translate_speech_to_speech,
    input_audio,
    output_audio,
    title=title,
    description="Speak in any language, and the translator will convert it to speech in the target language.",
)

# Start the web server only when run as a script, not when imported.
if __name__ == "__main__":
    stt_demo.launch()