File size: 2,869 Bytes
a598796
 
 
 
d347764
 
 
 
 
d783d44
d347764
 
 
 
a598796
 
11f4b66
a598796
 
 
d347764
 
a598796
 
d347764
 
a598796
 
 
 
d347764
 
 
 
 
 
a598796
 
 
 
 
 
 
d347764
 
 
 
 
a598796
d347764
 
a598796
d347764
 
a598796
f805e49
a598796
 
66855f9
f805e49
 
c737803
 
 
d347764
226ec3a
d347764
f805e49
 
d347764
c737803
 
 
 
 
 
 
 
 
 
a598796
c737803
d347764
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
"""Pronkin_hw_task3.ipynb
https://colab.research.google.com/drive/149j9u-wsD3GiEwRA8clBrXQ8bh5DRk7I?usp=sharing
"""

import gradio as gr
import numpy as np
import torch
from datasets import load_dataset

from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor, pipeline, WhisperProcessor, VitsModel, VitsTokenizer


# Run inference on GPU when available, otherwise fall back to CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# ASR: multilingual wav2vec2 model covering 56 languages (speech -> text).
asr_pipe = pipeline("automatic-speech-recognition", model="voidful/wav2vec2-xlsr-multilingual-56", device=device)

# NOTE(review): this Whisper processor is loaded but never used below — candidate for removal.
processor = WhisperProcessor.from_pretrained("openai/whisper-small")

# Two-hop text translation: any supported language -> English, then English -> Russian.
translator_1 = pipeline("translation", model="Helsinki-NLP/opus-mt-mul-en")
translator_2 = pipeline("translation", model="Helsinki-NLP/opus-mt-en-ru")


# TTS: Meta MMS VITS model for Russian speech synthesis (text -> waveform).
model = VitsModel.from_pretrained("facebook/mms-tts-rus")
tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-rus")


def translator_mul_ru(text):
    """Translate *text* from any supported source language into Russian.

    Pivots through English: source -> English (opus-mt-mul-en), then
    English -> Russian (opus-mt-en-ru). Returns the Russian string.
    """
    english_text = translator_1(text)[0]['translation_text']
    russian_result = translator_2(english_text)
    return russian_result[0]['translation_text']

def translate(audio):
    # Run the multilingual ASR pipeline on the given audio (filepath from Gradio)
    # and return the recognised transcript text.
    # NOTE(review): `max_new_tokens` and `generate_kwargs={"task": "translate"}`
    # are seq2seq/Whisper generation arguments, but the configured model
    # (wav2vec2-xlsr, a CTC model) does not generate — these kwargs look
    # copied from a Whisper example and are likely ignored or may raise.
    # Confirm against the installed transformers version.
    outputs = asr_pipe(audio, max_new_tokens=256, generate_kwargs={"task": "translate"})
    return outputs["text"]

def synthesise(text):
    """Translate *text* into Russian and synthesise it as speech.

    Returns the VITS waveform tensor moved to the CPU.
    """
    russian_text = translator_mul_ru(text)
    token_ids = tokenizer(russian_text, return_tensors="pt")["input_ids"]

    # No gradients needed for pure inference.
    with torch.no_grad():
        waveform = model(token_ids)["waveform"]
    return waveform.cpu()


def speech_to_speech_translation(audio):
    """End-to-end pipeline: speech in any language -> Russian speech.

    Returns a ``(sample_rate, samples)`` tuple in the format Gradio's
    Audio output expects; samples are int16 PCM at 16 kHz.
    """
    recognised_text = translate(audio)
    print(recognised_text)  # debug: show the ASR transcript in the console
    waveform = synthesise(recognised_text)
    # Scale float waveform in [-1, 1] to int16 PCM range.
    pcm16 = (waveform.numpy() * 32767).astype(np.int16)
    return 16000, pcm16[0]


# UI strings shown on both interface tabs. The description is intentionally
# in Russian (user-facing text) and summarises the three pipeline stages:
# multilingual ASR, two-hop text translation to Russian, and Russian TTS.
title = "Pronkin custom STST"
description = """
* ASR-модель распознает речь с помощью voidful/wav2vec2-xlsr-multilingual-56 и возвращает текст на любом из 56 языков.
* Перевод текста с любого на английский с помощью модели Helsinki-NLP/opus-mt-mul-en, с английского на русский - Helsinki-NLP/opus-mt-en-ru 
* Синтез речи на русском языке с помощью модели facebook/mms-tts-rus
"""

demo = gr.Blocks()

# Tab 1: record input from the microphone.
# NOTE(review): `gr.Audio(source=...)` is the Gradio 3.x API; Gradio 4+
# renamed it to `sources=[...]` — confirm against the installed version.
mic_translate = gr.Interface(
    fn=speech_to_speech_translation,
    inputs=gr.Audio(source="microphone", type="filepath"),
    outputs=gr.Audio(label="Generated Speech", type="numpy"),
    title=title,
    description=description,
)

# Tab 2: upload an audio file instead of recording.
file_translate = gr.Interface(
    fn=speech_to_speech_translation,
    inputs=gr.Audio(source="upload", type="filepath"),
    outputs=gr.Audio(label="Generated Speech", type="numpy"),
    title=title,
    description=description,
)

# Present both interfaces as tabs inside one Blocks app.
with demo:
    gr.TabbedInterface([mic_translate, file_translate], ["Microphone", "File"])

demo.launch()