cascaded_speech_to_speech_translation_in_deutsch

Sleeping

cascaded_speech_to_speech_translation_in_deutsch

File size: 2,738 Bytes

aa2ad2d
 
 
 
 
b0361e0
aa2ad2d
 
 
 
 
 
 
b0361e0
 
 
aa2ad2d
 
b0361e0
aa2ad2d
 
 
 
b0361e0
 
 
 
 
 
 
 
aa2ad2d
c705c41
 
 
aa2ad2d
 
 
 
c705c41
 
aa2ad2d
 
 
b0361e0
aa2ad2d
b0361e0
aa2ad2d
 
 
 
 
 
 
58e77db
 
92e4310
58e77db
 
 
 
 
 
aa2ad2d
 
 
 
 
 
 
 
 
 
 
 
58e77db
aa2ad2d

import gradio as gr
import numpy as np
import torch
from datasets import load_dataset

from transformers import VitsModel, VitsTokenizer, pipeline


# Run the ASR pipeline on the first CUDA GPU when one is available, else on CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# Whisper checkpoint that turns the incoming speech into text.
asr_pipe = pipeline(
    task="automatic-speech-recognition",
    model="openai/whisper-base",
    device=device,
)

# German MMS text-to-speech model and its tokenizer, loaded from the same
# checkpoint. No device is passed here, so the default placement is used.
tts_checkpoint = "facebook/mms-tts-deu"
model = VitsModel.from_pretrained(tts_checkpoint)
tokenizer = VitsTokenizer.from_pretrained(tts_checkpoint)

def translate(audio):
    """Convert input speech to text with the Whisper ASR pipeline.

    Args:
        audio: Audio input forwarded unchanged to ``asr_pipe`` (the Gradio
            inputs in this file are configured with ``type="filepath"``).

    Returns:
        The ``"text"`` field of the pipeline output.
    """
    # Whisper runs its "transcribe" task with the decoding language forced to
    # German.
    # NOTE(review): presumably this is what makes non-German speech come out as
    # German text -- confirm against the Whisper documentation.
    #
    # max_new_tokens is given inside generate_kwargs: passing it as a separate
    # pipeline call argument is deprecated in recent transformers releases.
    generate_kwargs = {
        "task": "transcribe",
        "language": "de",
        "max_new_tokens": 256,
    }
    outputs = asr_pipe(audio, generate_kwargs=generate_kwargs)
    return outputs["text"]


def synthesise(text):
    """Generate a speech waveform for ``text`` with the text-to-speech model.

    Args:
        text: The string to be spoken.

    Returns:
        The ``"waveform"`` entry of the model output.
    """
    token_ids = tokenizer(text, return_tensors="pt")["input_ids"]

    # Pure inference, so gradient tracking is switched off.
    with torch.no_grad():
        return model(token_ids)["waveform"]

# The synthesised audio is converted to int16 before it is handed to Gradio;
# max_range is the scale factor applied to the float waveform for that cast.
target_dtype = np.int16
_target_limits = np.iinfo(target_dtype)
max_range = _target_limits.max

def speech_to_speech_translation(audio):
    """Run the full cascade: input speech -> text -> synthesised speech.

    Args:
        audio: Audio input handed to ``translate``.

    Returns:
        A ``(sampling_rate, samples)`` tuple where ``samples`` is a NumPy array
        of ``target_dtype`` values, the form given to the ``gr.Audio`` output.
    """
    translated_text = translate(audio)
    # .cpu() is a no-op for a CPU tensor and keeps .numpy() valid otherwise.
    waveform = synthesise(translated_text).squeeze().cpu().numpy()

    # Scale the float waveform to the integer range, clipping first so that a
    # sample outside [-1, 1] saturates instead of overflowing in the cast.
    limits = np.iinfo(target_dtype)
    pcm = np.clip(waveform * max_range, limits.min, limits.max).astype(target_dtype)

    # Report the rate the TTS model is configured with rather than a literal.
    return model.config.sampling_rate, pcm


# Heading and markdown blurb shown on both interface tabs.
title = "Cascaded Speech To Speech Translation in German"
description = """
Demo for cascaded speech-to-speech translation (STST), mapping from source speech in any language to target speech in German. Demo uses OpenAI's [Whisper Base](https://huggingface.co/openai/whisper-base) model for speech translation, and Meta's [Massively Multilingual Speech German](https://huggingface.co/facebook/mms-tts-deu) model for text-to-speech.

The below diagram shows how the cascaded speech to speech translation works.
![Cascaded STST](https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation")
"""

demo = gr.Blocks()

# Tab 1: record from the microphone.
microphone_input = gr.Audio(sources="microphone", label="Audio", type="filepath")
microphone_output = gr.Audio(label="Generated Speech", type="numpy")
mic_translate = gr.Interface(
    fn=speech_to_speech_translation,
    inputs=microphone_input,
    outputs=microphone_output,
    title=title,
    description=description,
)

# Tab 2: upload an audio file. The bundled example is cached and flagging is
# turned off for this tab.
upload_input = gr.Audio(sources="upload", label="Audio file", type="filepath")
upload_output = gr.Audio(label="Generated Speech", type="numpy")
file_translate = gr.Interface(
    fn=speech_to_speech_translation,
    inputs=upload_input,
    outputs=upload_output,
    examples=[["./example.wav"]],
    title=title,
    description=description,
    cache_examples=True,
    allow_flagging="never",
)

# Put both interfaces side by side as tabs inside the Blocks app.
with demo:
    tab_names = ["Microphone", "Audio File"]
    gr.TabbedInterface([mic_translate, file_translate], tab_names)

demo.launch()