# NOTE(review): removed paste-site artifacts ("Spaces:", "Runtime error" x2)
# that preceded the code and were not valid Python.
# Third-party: OpenAI Whisper for transcription and language identification.
import whisper

# NVIDIA NeMo and its ASR, NLP and TTS collections.
import nemo
# Speech Recognition collection
import nemo.collections.asr as nemo_asr
# Natural Language Processing collection
import nemo.collections.nlp as nemo_nlp
# Speech Synthesis collection
import nemo.collections.tts as nemo_tts
from nemo.collections.nlp.models.dialogue.dialogue_zero_shot_intent_model import DialogueZeroShotIntentModel

# Local helpers.
from .utils import measure_time
class SpeechTranslate(): | |
def __init__(self,intents=None): | |
# Next, we instantiate all the necessary models directly from NVIDIA NGC | |
# Speech Recognition model - QuartzNet trained on Russian part of MCV 6.0 | |
self.intent_label= intents | |
self.intent_model=DialogueZeroShotIntentModel.from_pretrained("zeroshotintent_en_bert_base_uncased").eval() | |
self.transcription= whisper.load_model("base") | |
# Neural Machine Translation model | |
self.nmt_model = nemo_nlp.models.MTEncDecModel.from_pretrained(model_name='nmt_de_en_transformer24x6').eval() | |
self.nmt_model_de = nemo_nlp.models.MTEncDecModel.from_pretrained(model_name='nmt_en_de_transformer24x6').eval() | |
# Spectrogram generator which takes text as an input and produces spectrogram | |
self.spectrogram_generator = nemo_tts.models.FastPitchModel.from_pretrained(model_name="tts_de_fastpitch_singlespeaker").eval() | |
# Vocoder model which takes spectrogram and produces actual audio | |
self.vocoder = nemo_tts.models.HifiGanModel.from_pretrained(model_name="tts_de_slr_hifigan_ft_fastpitch_singlespeaker").eval() | |
def translate(self,speechfile): | |
# Transcribe an audio file | |
# IMPORTANT: The audio must be mono with 16Khz sampling rate | |
text = self.transcription.transcribe(speechfile) | |
# You should see russian text here. Let's translate it to English | |
if text["language"]=="de": | |
english_text = self.nmt_model.translate([text["text"]]) | |
elif text["language"]=="en": | |
english_text=text["text"] | |
else: | |
raise NotImplementedError(f"Language: {text['language']} currently not supported") | |
if self.intent_label is None: | |
self.text = self.nmt_model_de.translate(english_text) | |
else: | |
self.text=english_text | |
# After this you should see English translation | |
# Let's convert it into audio | |
# A helper function which combines FastPitch and HiFiGAN to go directly from | |
# text to audio | |
def get_intent(self): | |
intents = self.intent_model.predict([self.text[0]],self.intent_label) | |
intent = [f"This is a {intents[0]['labels'][0]}, I will route you to the corresponding department"] | |
print(intents) | |
intenti = self.nmt_model_de.translate(intent) | |
return intenti,intents[0]['labels'][0] | |
def text_to_audio(self): | |
parsed = self.spectrogram_generator.parse(self.text[0]) | |
spectrogram = self.spectrogram_generator.generate_spectrogram(tokens=parsed) | |
audio = self.vocoder.convert_spectrogram_to_audio(spec=spectrogram) | |
return audio.to('cpu').detach().numpy() | |
def process(self,speechfile,intents): | |
self.intent_label = intents.split(",") if intents is not None else None | |
self.translate(speechfile) | |
if self.intent_label is not None: | |
self.text,intent = self.get_intent() | |
return self.text_to_audio(),intent | |