import gradio as gr
import librosa
import numpy as np
import torch
from transformers import VitsModel, VitsTokenizer
from transformers import WhisperForConditionalGeneration, WhisperProcessor

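# Gradio's numpy audio output expects 16-bit PCM, so float waveforms are scaled
# by the int16 maximum before casting.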
target_dtype = np.int16
max_range = np.iinfo(target_dtype).max

device = "cuda:0" if torch.cuda.is_available() else "cpu"

# MMS (a VITS model) for Dutch text-to-speech
model_mms = VitsModel.from_pretrained("facebook/mms-tts-nld").to(device)
tokenizer_mms = VitsTokenizer.from_pretrained("facebook/mms-tts-nld")

# Whisper for speech recognition; the forced decoder ids make it emit Dutch text
processor = WhisperProcessor.from_pretrained("openai/whisper-base")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-base").to(device)
forced_decoder_ids = processor.get_decoder_prompt_ids(language="nl", task="transcribe")
sampling_rate = processor.feature_extractor.sampling_rate
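
# Speech recognition step: Whisper takes a 1-D float waveform sampled at 16 kHz
# and returns the decoded Dutch transcription.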
def translate(audio):
    input_features = processor(audio, sampling_rate=sampling_rate, return_tensors="pt").input_features
    predicted_ids = model.generate(input_features.to(device), forced_decoder_ids=forced_decoder_ids)
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
    return transcription[0]
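
# Text-to-speech step: MMS turns the Dutch text into a float waveform of shape
# (batch, samples) at 16 kHz, matching the pipeline's sampling rate.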
def synthesise(text):
    inputs = tokenizer_mms(text, return_tensors="pt")
    input_ids = inputs["input_ids"].to(device)
    with torch.no_grad():
        outputs = model_mms(input_ids)
    return outputs["waveform"]
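
# Full cascade: load and resample the input audio, transcribe, synthesise, and
# convert the waveform to 16-bit PCM for Gradio.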
def speech_to_speech_translation(audio):
    # librosa resamples to Whisper's 16 kHz input rate on load
    data_16, _ = librosa.load(audio, sr=sampling_rate)
    translated_text = translate(data_16)
    synthesised_speech = synthesise(translated_text)
    # move to CPU (in case the model ran on GPU) and scale to int16
    synthesised_speech = (synthesised_speech.cpu().numpy() * max_range).astype(target_dtype)
    return sampling_rate, synthesised_speech.T
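
# Quick offline check (hypothetical file path; assumes a mono audio file exists):
#   rate, samples = speech_to_speech_translation("example.wav")
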
title = "Cascaded STST"
description = """
Demo for cascaded speech-to-speech translation (STST), mapping from source speech to target speech in Dutch. The demo uses OpenAI's [Whisper Base](https://huggingface.co/openai/whisper-base) model for speech recognition and Meta AI's
[MMS TTS](https://huggingface.co/facebook/mms-tts-nld) model for Dutch text-to-speech:

![Cascaded STST](https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation")
"""
demo = gr.Blocks()
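# Note: `source=` follows the Gradio 3.x Audio API; Gradio 4 renamed it to
# `sources=["microphone"]` / `sources=["upload"]`.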
mic_translate = gr.Interface(
    fn=speech_to_speech_translation,
    inputs=gr.Audio(source="microphone", type="filepath"),
    outputs=gr.Audio(label="Generated Speech", type="numpy"),
    title=title,
    description=description,
)

file_translate = gr.Interface(
    fn=speech_to_speech_translation,
    inputs=gr.Audio(source="upload", type="filepath"),
    outputs=gr.Audio(label="Generated Speech", type="numpy"),
    examples=[["./example.wav"]],
    title=title,
    description=description,
)
with demo:
    gr.TabbedInterface([mic_translate, file_translate], ["Microphone", "Audio File"])

demo.launch()