Spaces:
Runtime error
Runtime error
File size: 3,050 Bytes
d347764 783e3c5 d347764 f9e92c9 d347764 783e3c5 7b3f433 783e3c5 d347764 a30997d d347764 f9e92c9 40d4679 1a56ce9 d347764 f9e92c9 d347764 f805e49 b97b97e f805e49 c737803 d347764 226ec3a d347764 f805e49 d347764 c737803 3946ba6 c737803 b98b69c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 |
import gradio as gr
import numpy as np
import torch
from datasets import load_dataset
from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor, pipeline
from transformers import VitsModel, AutoTokenizer
import torch
device = "cuda:0" if torch.cuda.is_available() else "cpu"
# load speech translation checkpoint
asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-base", device=device)
# load text-to-speech checkpoint and speaker embeddings
processor = SpeechT5Processor.from_pretrained("vadhri/speecht5_finetuned_voxpopuli_nl")
# model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(device)
model = SpeechT5ForTextToSpeech.from_pretrained("vadhri/speecht5_finetuned_voxpopuli_nl").to(device)
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation", trust_remote_code=True)
speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
def translate(audio):
outputs = asr_pipe(audio, max_new_tokens=256, generate_kwargs={"task": "transcribe", "language": "nl"})
print ("Translated text", outputs["text"][:600])
return outputs["text"][:600]
def synthesise(text):
inputs = processor(text=text, return_tensors="pt")
speech = model.generate_speech(inputs["input_ids"].to(device), speaker_embeddings.to(device), vocoder=vocoder)
return speech.cpu()
def speech_to_speech_translation(audio):
translated_text = translate(audio)
synthesised_speech = synthesise(translated_text)
synthesised_speech = (synthesised_speech.numpy() * 32767).astype(np.int16)
return 16000, synthesised_speech
title = "Cascaded STST"
description = """
Demo for cascaded speech-to-speech translation (STST), mapping from source speech in any language to target speech in Dutch.
Demo uses OpenAI's [Whisper Base](https://huggingface.co/openai/whisper-base) model for speech translation, and
a custom fine tuned version of [Microsoft's SpeechT5 TTS] fine tuned for Dutch - vadhri/speecht5_finetuned_voxpopuli_nl (https://huggingface.co/vadhri/speecht5_finetuned_voxpopuli_nl) model for text-to-speech:

"""
demo = gr.Blocks()
mic_translate = gr.Interface(
fn=speech_to_speech_translation,
inputs=gr.Audio(source="microphone", type="filepath"),
outputs=gr.Audio(label="Generated Speech", type="numpy"),
title=title,
description=description,
)
file_translate = gr.Interface(
fn=speech_to_speech_translation,
inputs=gr.Audio(source="upload", type="filepath"),
outputs=gr.Audio(label="Generated Speech", type="numpy"),
examples=[["./example.wav"]],
title=title,
description=description,
)
with demo:
gr.TabbedInterface([mic_translate, file_translate], ["Microphone", "Audio File"])
demo.launch()
|