speech-to-speech-translation

Runtime error

File size: 3,631 Bytes

639349a
 
 
 
 
 
 
 
 
 
 
9cf5830
639349a
 
2fc73b8
 
65dbb51
d1fb32e
 
365a7ea
0e3853b
d1fb32e
639349a
d1fb32e
48e001b
6f4b9a7
65dbb51
6442b57
48e001b
f61aa66
639349a
 
 
 
 
48e001b
 
 
639349a
 
48e001b
33bb1b2
 
 
397665a
33bb1b2
 
 
639349a

import gradio as gr
import numpy as np
import torch
from datasets import load_dataset

from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor, pipeline


# Run everything on the first GPU when one is available, otherwise on the CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Speech -> text stage: Whisper checkpoint wrapped in an ASR pipeline.
asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-base", device=device)

# Text -> speech stage: fine-tuned SpeechT5 checkpoint. The processor and the
# model are loaded from the same checkpoint so tokenisation matches the model.
TTS_CHECKPOINT = "Salama1429/TTS_German_Speecht5_finetuned_voxpopuli_nl"
processor = SpeechT5Processor.from_pretrained(TTS_CHECKPOINT)

# The model must live on `device`: synthesise() moves the input ids and the
# speaker embedding to `device`, and the vocoder below is on `device` as well.
# Leaving the model on the CPU causes a device-mismatch error on CUDA hosts.
model = SpeechT5ForTextToSpeech.from_pretrained(TTS_CHECKPOINT).to(device)

# Vocoder that turns the spectrogram produced by SpeechT5 into a waveform.
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)

# Speaker embedding (x-vector) that conditions the synthesised voice; entry
# 7306 of the validation split is used, with a leading batch dimension added.
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)


#def translate(audio):
#    outputs = asr_pipe(audio, max_new_tokens=256, generate_kwargs={"task": "translate"})
#    return outputs["text"]


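# Variant of the base function above: translates from any source language X into a
# target language Y (here "de") by running Whisper with "task": "transcribe" and
# forcing the output language to Y.
# At inference, call it as translate(sample["audio"].copy()).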

def translate(audio):
    """Run the Whisper ASR pipeline on ``audio`` and return the decoded text.

    Whisper is called with the "transcribe" task and the language forced to
    German ("de"); generation is capped at 256 new tokens.

    Args:
        audio: Input accepted by ``asr_pipe`` (the Gradio inputs in this file
            pass a file path).

    Returns:
        The ``"text"`` entry of the pipeline output.
    """
    whisper_options = {"task": "transcribe", "language": "de"}
    result = asr_pipe(audio, max_new_tokens=256, generate_kwargs=whisper_options)
    return result["text"]


def synthesise(text):
    """Generate speech for ``text`` with SpeechT5 and return it on the CPU.

    Args:
        text: The text to be spoken.

    Returns:
        The tensor produced by ``model.generate_speech`` (with the HiFi-GAN
        vocoder applied), moved to the CPU.
    """
    # Tokenise the text into PyTorch tensors and keep only the input ids.
    token_ids = processor(text=text, return_tensors="pt")["input_ids"]
    waveform = model.generate_speech(
        token_ids.to(device),
        speaker_embeddings.to(device),
        vocoder=vocoder,
    )
    return waveform.cpu()


def speech_to_speech_translation(audio):
    """Turn input speech into synthesised speech via text.

    The audio is first converted to text with ``translate`` and that text is
    then spoken with ``synthesise``.

    Args:
        audio: Audio input forwarded unchanged to ``translate``.

    Returns:
        A ``(sample_rate, samples)`` tuple where ``sample_rate`` is 16000 and
        ``samples`` is a NumPy ``int16`` array.
    """
    waveform = synthesise(translate(audio)).numpy()
    # Scale by 32767 and cast to int16 to get integer PCM samples.
    pcm_samples = (waveform * 32767).astype(np.int16)
    return 16000, pcm_samples


# Heading and Markdown blurb shown at the top of each Gradio interface.
# The blurb names German as the target language and links the fine-tuned
# SpeechT5 checkpoint, matching what the pipeline in this file actually loads
# (Whisper output language "de" and the German SpeechT5 checkpoint).
title = "Cascaded STST"
description = """
Demo for cascaded speech-to-speech translation (STST), mapping from source speech in any language to target speech in German. Demo uses OpenAI's [Whisper Base](https://huggingface.co/openai/whisper-base) model for speech translation, and a fine-tuned
[SpeechT5 TTS](https://huggingface.co/Salama1429/TTS_German_Speecht5_finetuned_voxpopuli_nl) model for text-to-speech:

![Cascaded STST](https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation")
"""

# Top-level container that hosts the two tabs defined below.
demo = gr.Blocks()

# NOTE(review): `source=` is the Gradio 3.x spelling; newer Gradio releases
# expect `sources=[...]` instead. Confirm against the Gradio version this app
# is pinned to.

# Tab 1: record from the microphone. The recording is handed to the callback
# as a file path and the result is returned as a (rate, numpy array) pair.
mic_input = gr.Audio(source="microphone", type="filepath")
mic_output = gr.Audio(label="Generated Speech", type="numpy")
mic_translate = gr.Interface(
    fn=speech_to_speech_translation,
    inputs=mic_input,
    outputs=mic_output,
    title=title,
    description=description,
)

# Tab 2: upload an audio file; a bundled example clip is offered as well.
file_input = gr.Audio(source="upload", type="filepath")
file_output = gr.Audio(label="Generated Speech", type="numpy")
file_translate = gr.Interface(
    fn=speech_to_speech_translation,
    inputs=file_input,
    outputs=file_output,
    examples=[["./example.wav"]],
    title=title,
    description=description,
)

# Present both interfaces as tabs inside the Blocks container.
with demo:
    gr.TabbedInterface(
        [mic_translate, file_translate],
        ["Microphone", "Audio File"],
    )

demo.launch()