"""
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/16MxXQeF3O0htL9eQ61aa6ZxnApGg9TKN
"""
import gradio as gr
import numpy as np
import torch
from transformers import VitsModel, VitsTokenizer, pipeline
device = "cuda:0" if torch.cuda.is_available() else "cpu"
# Speech recognition: transcribe the input audio to English text
asr_pipe = pipeline("automatic-speech-recognition", model="asapp/sew-d-tiny-100k-ft-ls100h", device=device)
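# Note: this SEW-D checkpoint is fine-tuned on 100h of LibriSpeech, so it
# only recognises English speech.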
# Translate the English transcript to Russian
translation_en_to_rus = pipeline("translation", model="Helsinki-NLP/opus-mt-en-ru")
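# opus-mt-en-ru is a MarianMT encoder-decoder; the translation pipeline
# handles tokenisation, generation and decoding in one call.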
# Text-to-speech: synthesise Russian speech with MMS-TTS (a VITS checkpoint)
model = VitsModel.from_pretrained("facebook/mms-tts-rus")
tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-rus")
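# Some MMS-TTS checkpoints expect romanised input: if tokenizer.is_uroman is
# True, the text must be pre-processed with the uroman tool before synthesis.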
def translate(audio):
    # SEW-D is a CTC model, so no seq2seq generation arguments are needed
    en_text = asr_pipe(audio)
    translated_text = translation_en_to_rus(en_text["text"])
    return translated_text[0]["translation_text"]
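# The ASR pipeline accepts a filepath, raw bytes, or a numpy array, so
# translate() works with Gradio's type="filepath" audio inputs directly.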
def synthesise(text):
    inputs = tokenizer(text, return_tensors="pt")
    # VitsModel is not generative in the seq2seq sense: a plain forward pass
    # returns the waveform directly, shape (batch_size, num_samples)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.waveform.cpu()
def speech_to_speech_translation(audio):
    translated_text = translate(audio)
    synthesised_speech = synthesise(translated_text)
    # Scale the float32 waveform to 16-bit PCM for Gradio's numpy audio format
    synthesised_speech = (synthesised_speech.numpy() * 32767).astype(np.int16)
    return model.config.sampling_rate, synthesised_speech[0]
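# Gradio UI: with type="numpy", the output gr.Audio expects a
# (sampling_rate, int16 numpy array) tuple, which the function above returns.
# Note that source= is the Gradio 3.x argument name; Gradio 4+ renamed it
# to sources=["microphone"] / sources=["upload"].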
demo = gr.Blocks()
mic_translate = gr.Interface(
    fn=speech_to_speech_translation,
    inputs=gr.Audio(source="microphone", type="filepath"),
    outputs=gr.Audio(label="Generated Speech", type="numpy"),
)
file_translate = gr.Interface(
    fn=speech_to_speech_translation,
    inputs=gr.Audio(source="upload", type="filepath"),
    outputs=gr.Audio(label="Generated Speech", type="numpy"),
    examples=[["./example.wav"]],
)
with demo:
    gr.TabbedInterface([mic_translate, file_translate], ["Microphone", "Audio File"])
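# launch() serves the app locally (http://127.0.0.1:7860 by default);
# pass share=True for a temporary public link.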
demo.launch()