"""
Automatically generated by Colaboratory.
Original file is located at
    https://colab.research.google.com/drive/16MxXQeF3O0htL9eQ61aa6ZxnApGg9TKN
"""

import gradio as gr
import numpy as np
import torch

from transformers import pipeline
from transformers import VitsModel, VitsTokenizer

device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Transcribe audio to English text
asr_pipe = pipeline("automatic-speech-recognition", model="asapp/sew-d-mid-400k-ft-ls100h", device=device)

# Translate English text to Russian text
translation_en_to_rus = pipeline("translation", model="Helsinki-NLP/opus-mt-en-ru")

# Create speech from Russian text
# ("facebook/mms-tts-rus" is a Russian VITS text-to-speech checkpoint;
# the Helsinki-NLP opus-mt checkpoints are translation models, not TTS)
vits_model = VitsModel.from_pretrained("facebook/mms-tts-rus")
vits_tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-rus")
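
# A minimal standalone check of the TTS stage, as a sketch (assumption: run
# once after loading to confirm the checkpoint yields audio; not part of the
# app itself):
#
#   with torch.no_grad():
#       _out = vits_model(**vits_tokenizer("привет", return_tensors="pt"))
#   print(_out.waveform.shape)  # (1, num_samples) at vits_model.config.sampling_rate
#
# Some MMS TTS tokenizers expect romanised input; check
# vits_tokenizer.is_uroman before feeding raw Cyrillic text.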


def translator(audio):
    # SEW-D is an English-only CTC model: plain transcription,
    # no generation kwargs apply
    en_text = asr_pipe(audio)["text"]
    translated_text = translation_en_to_rus(en_text)
    return translated_text[0]["translation_text"]

def synthesise(text):
    # Tokenise the Russian text and run VITS to get a waveform tensor
    inputs = vits_tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        speech = vits_model(**inputs).waveform
    return speech.cpu()

def speech_to_speech_translation(audio):
    translated_text = translator(audio)
    synthesised_speech = synthesise(translated_text)
    # Scale the float waveform to 16-bit PCM for Gradio's numpy audio output
    synthesised_speech = (synthesised_speech.numpy() * 32767).astype(np.int16)
    return vits_model.config.sampling_rate, synthesised_speech[0]
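
# A quick end-to-end sanity check, as a sketch (assumption: an English-speech
# clip exists at ./example.wav, matching the Gradio example below):
#
#   sr, wav = speech_to_speech_translation("./example.wav")
#   print(sr, wav.shape, wav.dtype)  # e.g. 16000 (num_samples,) int16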


demo = gr.Blocks()

mic_translate = gr.Interface(
    fn=speech_to_speech_translation,
    inputs=gr.Audio(source="microphone", type="filepath"),
    outputs=gr.Audio(label="Generated Speech", type="numpy")
)

file_translate = gr.Interface(
    fn=speech_to_speech_translation,
    inputs=gr.Audio(source="upload", type="filepath"),
    outputs=gr.Audio(label="Generated Speech", type="numpy"),
    examples=[["./example.wav"]]
)

with demo:
    gr.TabbedInterface([mic_translate, file_translate], ["Microphone", "Audio File"])

demo.launch()