File size: 3,057 Bytes
dbcecf3
4b6c8e0
 
 
 
 
 
d347764
 
 
 
 
4b6c8e0
d347764
 
 
 
dbcecf3
d347764
1622d47
4b6c8e0
 
d347764
4b6c8e0
 
d347764
4b6c8e0
d347764
4b6c8e0
 
 
 
 
 
 
d347764
 
 
 
 
 
 
4b6c8e0
 
 
 
 
 
 
d347764
 
 
 
 
4b6c8e0
d347764
 
4b6c8e0
d347764
a2dc490
f805e49
1622d47
 
 
 
 
f805e49
 
c737803
 
 
d347764
226ec3a
d347764
f805e49
 
d347764
c737803
 
 
 
 
 
 
 
 
 
4b6c8e0
c737803
d347764
4b6c8e0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
# -*- coding: utf-8 -*-

# !pip -q uninstall gradio -y
# !pip -q install gradio==3.50.2

# !pip -q install datasets

import gradio as gr
import numpy as np
import torch
from datasets import load_dataset

from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor, pipeline, WhisperProcessor

# Run on the first CUDA device when one is available, otherwise on CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Load the automatic speech recognition (ASR) checkpoint used to transcribe the input audio.
asr_pipe = pipeline("automatic-speech-recognition", model="voidful/wav2vec2-xlsr-multilingual-56", device=device)

# NOTE(review): `processor` is not referenced by any function visible in this
# file; it is kept so the module-level name remains available. Confirm whether
# it can be dropped to avoid fetching the Whisper processor at start-up.
processor = WhisperProcessor.from_pretrained(
    "openai/whisper-small")

# Two-stage text translation: multilingual -> English, then English -> Russian.
# Both pipelines are placed on the same device as the ASR pipeline so that a
# detected GPU is used consistently instead of silently falling back to CPU.
translator1 = pipeline("translation", model="Helsinki-NLP/opus-mt-mul-en", device=device)
translator2 = pipeline("translation", model="Helsinki-NLP/opus-mt-en-ru", device=device)

from transformers import VitsModel, VitsTokenizer

# The Russian text-to-speech model and its tokenizer come from one checkpoint.
_TTS_CHECKPOINT = "facebook/mms-tts-rus"

tokenizer = VitsTokenizer.from_pretrained(_TTS_CHECKPOINT)
model = VitsModel.from_pretrained(_TTS_CHECKPOINT)

def translator_mul_ru(text):
    """Translate ``text`` into Russian, pivoting through English.

    The text is first passed through ``translator1`` and the resulting English
    text through ``translator2``; the final ``translation_text`` is returned.
    """
    english_text = translator1(text)[0]['translation_text']
    russian_result = translator2(english_text)
    return russian_result[0]['translation_text']

def translate(audio):
    """Run the ASR pipeline on ``audio`` and return the recognised text.

    NOTE(review): ``max_new_tokens`` and ``{"task": "translate"}`` look like
    generation options for Whisper-style models; confirm what effect, if any,
    they have on the wav2vec2 checkpoint loaded into ``asr_pipe``.
    """
    generation_options = {"task": "translate"}
    recognition = asr_pipe(audio, max_new_tokens=256, generate_kwargs=generation_options)
    return recognition["text"]


def synthesise(text):
    """Translate ``text`` to Russian and synthesise it as speech.

    Returns the ``"waveform"`` output of the TTS model as a CPU tensor.
    """
    russian_text = translator_mul_ru(text)
    token_ids = tokenizer(russian_text, return_tensors="pt")["input_ids"]

    # Inference only: no gradients are needed.
    with torch.no_grad():
        waveform = model(token_ids)["waveform"]
    return waveform.cpu()


def speech_to_speech_translation(audio):
    """Turn spoken audio into synthesised Russian speech.

    Args:
        audio: Input handed to the ASR pipeline (the interfaces in this file
            pass a file path). ``None`` means nothing was recorded or uploaded.

    Returns:
        A ``(sampling_rate, samples)`` tuple, where ``samples`` is the first
        waveform of the synthesised batch converted to ``int16``.

    Raises:
        gr.Error: If ``audio`` is ``None``.
    """
    if audio is None:
        # Without this guard the empty input is passed straight to the ASR
        # pipeline; report a readable message to the UI instead.
        raise gr.Error("No audio provided. Please record or upload an audio clip.")

    translated_text = translate(audio)
    print(translated_text)
    waveform = synthesise(translated_text).numpy()
    # Clip before scaling so out-of-range samples saturate rather than wrap
    # around in the int16 cast.
    pcm = (np.clip(waveform, -1.0, 1.0) * 32767).astype(np.int16)
    # Use the TTS model's configured rate rather than a hard-coded constant so
    # playback speed stays correct if the checkpoint is swapped.
    return model.config.sampling_rate, pcm[0]

# Title passed to both Gradio interfaces below.
title = "Cascaded STST. Any language to russian"
# Description passed to both interfaces (written in Russian). In English, roughly:
#   * the ASR model is voidful/wav2vec2-xlsr-multilingual-56, picked as the most
#     popular model under the "multilingual" + "wav2vec" filters after Facebook's;
#   * the recording's language is translated to English and then to Russian;
#   * the translated text is then played back in Russian.
description = """
* В качестве ASR модели была выбрана - https://huggingface.co/voidful/wav2vec2-xlsr-multilingual-56, если поставить фильтры multilingual и wav2vec, то эта модель самая популярная после фейсбуковских - 
https://imgur.com/UNH5ym1
* Далее идет перевод с языка, на котором была запись, на английский, и после этого на русский 
* Потом переведенный текст воспроизводится на русском языке 

"""

def _make_interface(audio_source):
    """Build one speech-translation interface fed by ``audio_source``."""
    return gr.Interface(
        fn=speech_to_speech_translation,
        inputs=gr.Audio(source=audio_source, type="filepath"),
        outputs=gr.Audio(label="Generated Speech", type="numpy"),
        title=title,
        description=description,
    )


demo = gr.Blocks()

# The same pipeline backs both tabs; only the audio source differs.
mic_translate = _make_interface("microphone")
file_translate = _make_interface("upload")

with demo:
    gr.TabbedInterface(
        [mic_translate, file_translate],
        ["Microphone", "File"],
    )

demo.launch()