# Import Whisper for speech recognition and NeMo's NLP and TTS collections
import whisper

# Import Natural Language Processing collection
import nemo.collections.nlp as nemo_nlp
# Import Speech Synthesis collection
import nemo.collections.tts as nemo_tts
from nemo.collections.nlp.models.dialogue.dialogue_zero_shot_intent_model import DialogueZeroShotIntentModel

from .utils import measure_time


class SpeechTranslate:
    @measure_time
    def __init__(self, intents=None):
        # Instantiate all the necessary models directly from NVIDIA NGC
        self.intent_label = intents
        # Zero-shot intent classifier (English, BERT base uncased)
        self.intent_model = DialogueZeroShotIntentModel.from_pretrained(
            "zeroshotintent_en_bert_base_uncased"
        ).eval()
        # Speech recognition model - Whisper detects the spoken language and transcribes it
        self.transcription = whisper.load_model("base")
        # Neural machine translation models (German -> English and English -> German)
        self.nmt_model = nemo_nlp.models.MTEncDecModel.from_pretrained(
            model_name="nmt_de_en_transformer24x6"
        ).eval()
        self.nmt_model_de = nemo_nlp.models.MTEncDecModel.from_pretrained(
            model_name="nmt_en_de_transformer24x6"
        ).eval()
        # Spectrogram generator which takes text as input and produces a spectrogram
        self.spectrogram_generator = nemo_tts.models.FastPitchModel.from_pretrained(
            model_name="tts_de_fastpitch_singlespeaker"
        ).eval()
        # Vocoder model which takes a spectrogram and produces the actual audio
        self.vocoder = nemo_tts.models.HifiGanModel.from_pretrained(
            model_name="tts_de_slr_hifigan_ft_fastpitch_singlespeaker"
        ).eval()

    @measure_time
    def translate(self, speechfile):
        # Transcribe the audio file. Whisper resamples the input to mono 16 kHz
        # internally and reports the detected language alongside the transcript.
        result = self.transcription.transcribe(speechfile)
        # Normalize the transcript to English. NeMo's translate() takes and
        # returns a list of strings, so keep everything list-shaped.
        if result["language"] == "de":
            english_text = self.nmt_model.translate([result["text"]])
        elif result["language"] == "en":
            english_text = [result["text"]]
        else:
            raise NotImplementedError(f"Language: {result['language']} currently not supported")
        if self.intent_label is None:
            # No intent routing requested: translate straight to German for synthesis
            self.text = self.nmt_model_de.translate(english_text)
        else:
            # Keep the English text so the zero-shot intent model can classify it
            self.text = english_text

    @measure_time
    def get_intent(self):
        # Classify the English transcript against the candidate intent labels
        intents = self.intent_model.predict([self.text[0]], self.intent_label)
        top_label = intents[0]["labels"][0]
        response = [f"This is a {top_label}, I will route you to the corresponding department"]
        print(intents)
        # Translate the routing response to German for synthesis
        response_de = self.nmt_model_de.translate(response)
        return response_de, top_label

    @measure_time
    def text_to_audio(self):
        # Combine FastPitch and HiFi-GAN to go directly from text to audio
        parsed = self.spectrogram_generator.parse(self.text[0])
        spectrogram = self.spectrogram_generator.generate_spectrogram(tokens=parsed)
        audio = self.vocoder.convert_spectrogram_to_audio(spec=spectrogram)
        return audio.detach().cpu().numpy()

    @measure_time
    def process(self, speechfile, intents):
        self.intent_label = intents.split(",") if intents is not None else None
        self.translate(speechfile)
        # intent stays None when no candidate labels were supplied
        intent = None
        if self.intent_label is not None:
            self.text, intent = self.get_intent()
        return self.text_to_audio(), intent
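

# ---------------------------------------------------------------------------
# Minimal usage sketch, not part of the pipeline above. Assumptions: the
# module is run inside its package (e.g. `python -m <package>.speech_translate`)
# because of the relative `.utils` import; "sample.wav" and the label string
# are hypothetical; `soundfile` is installed; 22050 Hz is assumed for the
# FastPitch model (check spectrogram_generator.cfg.sample_rate to be sure).
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    import soundfile as sf

    st = SpeechTranslate()
    # Pass a comma-separated label string to enable intent routing,
    # or None to skip classification and simply translate to German speech.
    audio, intent = st.process("sample.wav", "complaint,enquiry,order")
    print("Detected intent:", intent)
    # convert_spectrogram_to_audio returns a batched (1, samples) array
    sf.write("response_de.wav", audio[0], samplerate=22050)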