Spaces:
Runtime error
Runtime error
File size: 3,057 Bytes
dbcecf3 4b6c8e0 d347764 4b6c8e0 d347764 dbcecf3 d347764 1622d47 4b6c8e0 d347764 4b6c8e0 d347764 4b6c8e0 d347764 4b6c8e0 d347764 4b6c8e0 d347764 4b6c8e0 d347764 4b6c8e0 d347764 a2dc490 f805e49 1622d47 f805e49 c737803 d347764 226ec3a d347764 f805e49 d347764 c737803 4b6c8e0 c737803 d347764 4b6c8e0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 |
# -*- coding: utf-8 -*-
# !pip -q uninstall gradio -y
# !pip -q install gradio==3.50.2
# !pip -q install datasets
import gradio as gr
import numpy as np
import torch
from datasets import load_dataset
from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor, pipeline, WhisperProcessor
# Run on the first CUDA GPU when one is available, otherwise on the CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
# Load the speech-recognition pipeline; this is the only model that is
# explicitly placed on `device`.
asr_pipe = pipeline("automatic-speech-recognition", model="voidful/wav2vec2-xlsr-multilingual-56", device=device)
# NOTE(review): this Whisper processor is not referenced by any function
# visible in this file -- confirm it is still needed before keeping the load.
processor = WhisperProcessor.from_pretrained(
"openai/whisper-small")
# Two-stage text translation: many languages -> English, then English -> Russian.
# No `device` argument is passed to these pipelines.
translator1 = pipeline("translation", model="Helsinki-NLP/opus-mt-mul-en")
translator2 = pipeline("translation", model="Helsinki-NLP/opus-mt-en-ru")
from transformers import VitsModel, VitsTokenizer
# Russian text-to-speech model and its tokenizer, loaded from the same checkpoint.
# The model is not moved to `device` here.
model = VitsModel.from_pretrained("facebook/mms-tts-rus")
tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-rus")
def translator_mul_ru(text):
    """Translate ``text`` into Russian, pivoting through English.

    The text is first passed to ``translator1`` and the resulting English
    string is then passed to ``translator2``.

    Args:
        text: Source-language text to translate.

    Returns:
        The ``translation_text`` of the first result from ``translator2``.
    """
    # Stage 1: source language -> English.
    english_text = translator1(text)[0]['translation_text']
    # Stage 2: English -> Russian.
    russian_result = translator2(english_text)
    return russian_result[0]['translation_text']
def translate(audio):
    """Run the speech-recognition pipeline on ``audio`` and return its text.

    Args:
        audio: Audio input forwarded unchanged to ``asr_pipe``.

    Returns:
        The ``"text"`` field of the pipeline output.
    """
    # NOTE(review): `max_new_tokens` and `task="translate"` are generation
    # options; `asr_pipe` is built from a wav2vec2 checkpoint, so these may
    # have no effect -- confirm against the pipeline documentation.
    generation_options = {"task": "translate"}
    result = asr_pipe(
        audio,
        max_new_tokens=256,
        generate_kwargs=generation_options,
    )
    return result["text"]
def synthesise(text):
    """Translate ``text`` to Russian and synthesise it as speech.

    Args:
        text: Text to translate with ``translator_mul_ru`` and then speak.

    Returns:
        The ``"waveform"`` output of the TTS model, moved to the CPU.
    """
    russian_text = translator_mul_ru(text)
    token_ids = tokenizer(russian_text, return_tensors="pt")["input_ids"]
    # Inference only: no gradients are needed for synthesis.
    with torch.no_grad():
        waveform = model(token_ids)["waveform"]
    return waveform.cpu()
def speech_to_speech_translation(audio):
    """Turn an audio recording into synthesised Russian speech.

    The recording is transcribed with ``translate``, the recognised text is
    printed, and ``synthesise`` produces the spoken Russian translation.

    Args:
        audio: Audio input forwarded unchanged to ``translate`` (the Gradio
            inputs in this file are configured with ``type="filepath"``).

    Returns:
        A ``(sampling_rate, samples)`` tuple, where ``samples`` is the first
        waveform of the synthesised batch converted to an int16 NumPy array.
    """
    translated_text = translate(audio)
    print(translated_text)  # surface the recognised text in the app log
    synthesised_speech = synthesise(translated_text)
    # Scale to the 16-bit PCM range; assumes the waveform is float in [-1, 1].
    synthesised_speech = (synthesised_speech.numpy() * 32767).astype(np.int16)
    # Report the rate the loaded TTS checkpoint generates at, rather than a
    # hard-coded 16000 that would go stale if the checkpoint were swapped.
    return model.config.sampling_rate, synthesised_speech[0]
# Heading shown on each Gradio interface tab.
title = "Cascaded STST. Any language to russian"
# User-facing description (in Russian). In English it says:
#   * voidful/wav2vec2-xlsr-multilingual-56 was chosen as the ASR model; with
#     the "multilingual" and "wav2vec" filters applied it is the most popular
#     model after Facebook's own (screenshot at the imgur link);
#   * the recognised text is translated into English and then into Russian;
#   * the translated text is then spoken in Russian.
description = """
* В качестве ASR модели была выбрана - https://huggingface.co/voidful/wav2vec2-xlsr-multilingual-56, если поставить фильтры multilingual и wav2vec, то эта модель самая популярная после фейсбуковских -
https://imgur.com/UNH5ym1
* Далее идет перевод с языка, на котором была запись, на английский, и после этого на русский
* Потом переведенный текст воспроизводится на русском языке
"""
# Build the Gradio UI: one tab recording from the microphone and one tab taking
# an uploaded file, both running the same speech-to-speech function.
demo = gr.Blocks()


def _build_interface(audio_source):
    """Return a ``gr.Interface`` whose audio input comes from ``audio_source``."""
    audio_in = gr.Audio(source=audio_source, type="filepath")
    audio_out = gr.Audio(label="Generated Speech", type="numpy")
    return gr.Interface(
        fn=speech_to_speech_translation,
        inputs=audio_in,
        outputs=audio_out,
        title=title,
        description=description,
    )


mic_translate = _build_interface("microphone")
file_translate = _build_interface("upload")

tab_labels = ["Microphone", "File"]
with demo:
    gr.TabbedInterface([mic_translate, file_translate], tab_labels)

demo.launch()
|