File size: 2,869 Bytes
a598796 d347764 d783d44 d347764 a598796 11f4b66 a598796 d347764 a598796 d347764 a598796 d347764 a598796 d347764 a598796 d347764 a598796 d347764 a598796 f805e49 a598796 66855f9 f805e49 c737803 d347764 226ec3a d347764 f805e49 d347764 c737803 a598796 c737803 d347764 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 |
"""Pronkin_hw_task3.ipynb
https://colab.research.google.com/drive/149j9u-wsD3GiEwRA8clBrXQ8bh5DRk7I?usp=sharing
"""
import gradio as gr
import numpy as np
import torch
from datasets import load_dataset
from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor, pipeline, WhisperProcessor, VitsModel, VitsTokenizer
# Pick the first CUDA GPU when torch reports one, otherwise fall back to CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
# Speech recognition pipeline (multilingual wav2vec2 checkpoint), placed on `device`.
asr_pipe = pipeline("automatic-speech-recognition", model="voidful/wav2vec2-xlsr-multilingual-56", device=device)
# NOTE(review): this Whisper processor is not referenced by any other line
# visible in this file -- confirm whether it is still needed.
processor = WhisperProcessor.from_pretrained("openai/whisper-small")
# Two-stage text translation: any supported language -> English, then
# English -> Russian. Unlike `asr_pipe`, no `device` argument is passed here.
translator_1 = pipeline("translation", model="Helsinki-NLP/opus-mt-mul-en")
translator_2 = pipeline("translation", model="Helsinki-NLP/opus-mt-en-ru")
# Russian text-to-speech model and its matching tokenizer; neither is
# explicitly moved to `device` in this file.
model = VitsModel.from_pretrained("facebook/mms-tts-rus")
tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-rus")
def translator_mul_ru(text):
    """Translate *text* into Russian by pivoting through English.

    The text is first passed to ``translator_1``; the ``translation_text``
    of its first result is then passed to ``translator_2``.

    Returns:
        The ``translation_text`` of the first result from ``translator_2``.
    """
    english_text = translator_1(text)[0]['translation_text']
    russian_text = translator_2(english_text)[0]['translation_text']
    return russian_text
def translate(audio):
    """Run the ASR pipeline on *audio* and return its ``"text"`` output.

    Args:
        audio: Whatever ``asr_pipe`` accepts; the Gradio inputs in this file
            are configured with ``type="filepath"``.

    Returns:
        The value stored under ``"text"`` in the pipeline result.
    """
    # NOTE(review): these generation options look Whisper-oriented; confirm
    # they have any effect with the wav2vec2 checkpoint behind `asr_pipe`.
    asr_options = {
        "max_new_tokens": 256,
        "generate_kwargs": {"task": "translate"},
    }
    result = asr_pipe(audio, **asr_options)
    return result["text"]
def synthesise(text):
    """Translate *text* to Russian and synthesise it with the TTS model.

    Args:
        text: Source-language text; it is passed through
            ``translator_mul_ru`` before tokenisation.

    Returns:
        The model's ``"waveform"`` output, moved to CPU.
    """
    russian_text = translator_mul_ru(text)
    token_ids = tokenizer(russian_text, return_tensors="pt")["input_ids"]

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        waveform = model(token_ids)["waveform"]

    return waveform.cpu()
def speech_to_speech_translation(audio):
    """Convert input speech into synthesised Russian speech.

    Steps: recognise the audio with ``translate``, print the recognised text,
    synthesise it with ``synthesise`` (which translates to Russian first),
    and convert the waveform to 16-bit PCM.

    Args:
        audio: Audio input forwarded unchanged to ``translate``.

    Returns:
        A ``(sampling_rate, samples)`` tuple where ``samples`` is an
        ``int16`` NumPy array taken from index 0 of the synthesised output
        (presumably the single batch item -- confirm against the model
        output shape).
    """
    recognised_text = translate(audio)
    print(recognised_text)  # surface the intermediate text in the app logs

    waveform = synthesise(recognised_text).numpy()
    # Clamp to [-1, 1] before scaling so any out-of-range sample saturates
    # instead of wrapping around when cast to int16.
    pcm = (np.clip(waveform, -1.0, 1.0) * 32767).astype(np.int16)

    # Report the rate the loaded TTS checkpoint actually uses rather than a
    # hard-coded constant, so swapping the checkpoint keeps playback correct.
    return model.config.sampling_rate, pcm[0]
# Heading passed to both Gradio interfaces.
title = "Pronkin custom STST"
# Description passed to both Gradio interfaces (in Russian). It lists the
# three stages: speech recognition with voidful/wav2vec2-xlsr-multilingual-56,
# translation to English and then to Russian with the two Helsinki-NLP
# models, and Russian speech synthesis with facebook/mms-tts-rus.
description = """
* ASR-модель распознает речь с помощью voidful/wav2vec2-xlsr-multilingual-56 и возвращает текст на любом из 56 языков.
* Перевод текста с любого на английский с помощью модели Helsinki-NLP/opus-mt-mul-en, с английского на русский - Helsinki-NLP/opus-mt-en-ru
* Синтез речи на русском языке с помощью модели facebook/mms-tts-rus
"""
def _build_interface(audio_source):
    """Create one speech-to-speech ``gr.Interface`` fed from *audio_source*."""
    # NOTE(review): `source=` is the keyword used by older Gradio releases;
    # confirm it matches the Gradio version this app is pinned to.
    return gr.Interface(
        fn=speech_to_speech_translation,
        inputs=gr.Audio(source=audio_source, type="filepath"),
        outputs=gr.Audio(label="Generated Speech", type="numpy"),
        title=title,
        description=description,
    )


demo = gr.Blocks()

# One interface per input method; both share the same handler and texts.
mic_translate = _build_interface("microphone")
file_translate = _build_interface("upload")

with demo:
    gr.TabbedInterface([mic_translate, file_translate], ["Microphone", "File"])

demo.launch()
|