Spaces:
Runtime error
Runtime error
File size: 3,780 Bytes
4127c5c d347764 4127c5c d347764 4127c5c d347764 4127c5c d347764 4127c5c d347764 45c8117 d347764 4127c5c d347764 4127c5c 45c8117 4127c5c 45c8117 d347764 4127c5c d347764 4127c5c d347764 f805e49 9bf9f5e 7a93956 9bf9f5e f805e49 c737803 d347764 226ec3a d347764 f805e49 d347764 c737803 4127c5c c737803 7a93956 4127c5c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 |
# -*- coding: utf-8 -*-
"""HW3_ml.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1z4ht7K9pttbgWmDDnrQhqoZ6SYAiaeUe
"""
# !pip -q uninstall gradio -y
# !pip -q install gradio==3.50.2
# !pip -q install datasets
import gradio as gr
import numpy as np
import torch
from datasets import load_dataset
from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor, pipeline, WhisperProcessor
# Run on the first CUDA GPU when torch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# Speech-recognition stage: a wav2vec2 checkpoint that turns audio into text.
asr_pipe = pipeline(
    task="automatic-speech-recognition",
    model="voidful/wav2vec2-xlsr-multilingual-56",
    device=device,
)

# The original notebook installed sentencepiece at this point:
#   pip -q install sentencepiece

# NOTE(review): this Whisper processor is not referenced by any function
# visible in this file; confirm whether it is still needed before removing it.
processor = WhisperProcessor.from_pretrained("openai/whisper-small")

# Text translation stage: multilingual -> English, then English -> Russian.
translator1 = pipeline(task="translation", model="Helsinki-NLP/opus-mt-mul-en")
translator2 = pipeline(task="translation", model="Helsinki-NLP/opus-mt-en-ru")

from transformers import VitsModel, VitsTokenizer

# Text-to-speech stage: Russian MMS-TTS (VITS) model with its tokenizer.
# Both are loaded from the same checkpoint name so they stay in sync.
model = VitsModel.from_pretrained("facebook/mms-tts-rus")
tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-rus")
def translator_mul_ru(text):
    """Translate *text* into Russian, using English as a pivot language.

    The input is first run through ``translator1`` and the resulting string
    is then run through ``translator2``.

    Args:
        text: Source text to translate.

    Returns:
        The ``translation_text`` of the first result from ``translator2``.
    """
    # Each pipeline call returns a list of results; only the first is used.
    english_text = translator1(text)[0]['translation_text']
    russian_text = translator2(english_text)[0]['translation_text']
    return russian_text
def translate(audio):
    """Run the speech-recognition pipeline on *audio* and return its text.

    Args:
        audio: Audio input forwarded unchanged to ``asr_pipe``.

    Returns:
        The ``"text"`` entry of the pipeline output.
    """
    # NOTE(review): these are generation-time options (token budget and a
    # "translate" task hint). Confirm they have any effect with the
    # checkpoint loaded into ``asr_pipe``; they are kept exactly as before.
    recognition_options = {
        "max_new_tokens": 256,
        "generate_kwargs": {"task": "translate"},
    }
    result = asr_pipe(audio, **recognition_options)
    return result["text"]
def synthesise(text):
    """Translate *text* to Russian and synthesise it as speech.

    Args:
        text: Source-language text; it is translated with
            ``translator_mul_ru`` before being tokenised.

    Returns:
        The ``"waveform"`` tensor produced by ``model``, moved to the CPU.
    """
    russian_text = translator_mul_ru(text)
    token_ids = tokenizer(russian_text, return_tensors="pt")["input_ids"]

    # Inference only: skip gradient tracking.
    with torch.no_grad():
        waveform = model(token_ids)["waveform"]

    return waveform.cpu()
def speech_to_speech_translation(audio):
    """Convert source-language speech into synthesised Russian speech.

    The audio is transcribed with ``translate``, the transcript is printed,
    and ``synthesise`` translates it and produces the waveform.

    Args:
        audio: Audio input forwarded to ``translate``.

    Returns:
        A ``(sample_rate, samples)`` tuple: the sampling rate declared by the
        TTS model's config and the first waveform of the synthesised output
        as a 16-bit integer NumPy array.
    """
    # ``translate`` returns the recogniser's text; the actual translation to
    # Russian happens inside ``synthesise``.
    transcript = translate(audio)
    print(transcript)

    waveform = synthesise(transcript)

    # Scale to the int16 range for playback.
    # Assumes the waveform is float audio within [-1, 1] -- TODO confirm.
    pcm = (waveform.numpy() * 32767).astype(np.int16)

    # Take the sampling rate from the loaded TTS model rather than a
    # hard-coded 16000, so playback speed stays correct if the checkpoint
    # is changed.
    sample_rate = model.config.sampling_rate

    # The first entry along the leading axis is returned
    # (presumably the batch axis -- verify against the model output).
    return sample_rate, pcm[0]
# Heading and Markdown description shown on every tab of the demo.
title = "Cascaded STST"
description = """
* Сначала модель распознает речь с помощью voidful/wav2vec2-xlsr-multilingual-56 и возвращает текст на любом из 56 языков.
* Далее происходит перевод текста с любого на английский с помощью Helsinki-NLP/opus-mt-mul-en, а затем с английского на русский также с помощью Helsinki-NLP/opus-mt-en-ru
* В конце осуществляется воспроизведение русского текста моделью facebook/mms-tts-rus
Demo for cascaded speech-to-speech translation (STST), mapping from source speech in any language to target speech in Russian. Demo uses facebook/mms-tts-rus model for text-to-speech:
![Cascaded STST](https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation")
"""


def _build_tab(audio_source):
    """Return a speech-to-speech ``gr.Interface`` fed by *audio_source*.

    The two tabs differ only in where the input audio comes from, so the
    shared wiring lives here.
    """
    # NOTE(review): the ``source=`` keyword matches the gradio 3.50.2 pin
    # mentioned in the commented-out pip lines of this file; confirm the
    # installed gradio version accepts it.
    return gr.Interface(
        fn=speech_to_speech_translation,
        inputs=gr.Audio(source=audio_source, type="filepath"),
        outputs=gr.Audio(label="Generated Speech", type="numpy"),
        title=title,
        description=description,
    )


demo = gr.Blocks()

mic_translate = _build_tab("microphone")
file_translate = _build_tab("upload")

# One tab per input source, both backed by the same pipeline function.
with demo:
    gr.TabbedInterface(
        [mic_translate, file_translate],
        ["Microphone", "File"],
    )

demo.launch()
|