# NOTE(review): the lines originally here were non-code residue from a web
# file viewer (status text, file size, commit hash, line-number gutter) and
# were not valid Python; they have been converted to this comment.
# Import NeMo and its ASR, NLP and TTS collections
import nemo
# Import Speech Recognition collection
import nemo.collections.asr as nemo_asr
# Import Natural Language Processing collection
import nemo.collections.nlp as nemo_nlp
# Import Speech Synthesis collection
import nemo.collections.tts as nemo_tts
from nemo.collections.nlp.models.dialogue.dialogue_zero_shot_intent_model import DialogueZeroShotIntentModel
import whisper
from .utils import measure_time
class SpeechTranslate:
    """Speech-to-speech pipeline.

    Transcribes audio with Whisper (which also detects the spoken language),
    translates between German and English with NeMo NMT models, optionally
    classifies the utterance against a set of intent labels with a zero-shot
    BERT model, and synthesizes a German reply with FastPitch + HiFi-GAN.
    """

    @measure_time
    def __init__(self, intents=None):
        # Optional list of candidate intent labels for zero-shot intent
        # classification; None disables intent routing entirely.
        self.intent_label = intents
        # Zero-shot intent classifier (English BERT, NLI-style).
        self.intent_model = DialogueZeroShotIntentModel.from_pretrained(
            "zeroshotintent_en_bert_base_uncased"
        ).eval()
        # Whisper ASR model; `transcribe` returns both text and language.
        self.transcription = whisper.load_model("base")
        # Neural machine translation: German -> English...
        self.nmt_model = nemo_nlp.models.MTEncDecModel.from_pretrained(
            model_name='nmt_de_en_transformer24x6'
        ).eval()
        # ...and English -> German (used to produce the German TTS input).
        self.nmt_model_de = nemo_nlp.models.MTEncDecModel.from_pretrained(
            model_name='nmt_en_de_transformer24x6'
        ).eval()
        # Spectrogram generator: German text -> mel spectrogram.
        self.spectrogram_generator = nemo_tts.models.FastPitchModel.from_pretrained(
            model_name="tts_de_fastpitch_singlespeaker"
        ).eval()
        # Vocoder: mel spectrogram -> audio waveform.
        self.vocoder = nemo_tts.models.HifiGanModel.from_pretrained(
            model_name="tts_de_slr_hifigan_ft_fastpitch_singlespeaker"
        ).eval()

    @measure_time
    def translate(self, speechfile):
        """Transcribe *speechfile* and store the working text in ``self.text``.

        IMPORTANT: the audio must be mono with a 16 kHz sampling rate.

        German speech is first translated to English; English speech passes
        through unchanged.  When no intent labels are configured, the text is
        translated to German so it can be synthesized directly; otherwise it
        is kept in English for the (English) zero-shot intent model.

        Args:
            speechfile: path to the audio file to process.

        Raises:
            NotImplementedError: if the detected language is neither German
                nor English.
        """
        result = self.transcription.transcribe(speechfile)
        language = result["language"]
        if language == "de":
            # NMT translate() takes and returns a list of sentences.
            english_text = self.nmt_model.translate([result["text"]])
        elif language == "en":
            # BUGFIX: wrap in a list so self.text is always a list of
            # sentences (downstream code indexes self.text[0]); the original
            # stored a bare string here, making self.text[0] a single
            # character and passing the wrong type to translate().
            english_text = [result["text"]]
        else:
            raise NotImplementedError(
                f"Language: {result['language']} currently not supported"
            )
        if self.intent_label is None:
            # No intent routing: synthesize the utterance itself in German.
            self.text = self.nmt_model_de.translate(english_text)
        else:
            # Keep English for intent classification in get_intent().
            self.text = english_text

    @measure_time
    def get_intent(self):
        """Classify ``self.text`` against the configured intent labels.

        Returns:
            tuple: (routing message translated to German as a list of str,
            top-scoring intent label as str).
        """
        intents = self.intent_model.predict([self.text[0]], self.intent_label)
        top_label = intents[0]['labels'][0]
        message = [f"This is a {top_label}, I will route you to the corresponding department"]
        print(intents)
        # Translate the English routing message to German for synthesis.
        return self.nmt_model_de.translate(message), top_label

    @measure_time
    def text_to_audio(self):
        """Synthesize ``self.text[0]`` (German) to a waveform.

        Combines FastPitch (text -> spectrogram) and HiFi-GAN
        (spectrogram -> audio).

        Returns:
            numpy.ndarray: the synthesized audio on CPU.
        """
        parsed = self.spectrogram_generator.parse(self.text[0])
        spectrogram = self.spectrogram_generator.generate_spectrogram(tokens=parsed)
        audio = self.vocoder.convert_spectrogram_to_audio(spec=spectrogram)
        return audio.to('cpu').detach().numpy()

    @measure_time
    def process(self, speechfile, intents):
        """Run the full pipeline on one audio file.

        Args:
            speechfile: path to a mono 16 kHz audio file.
            intents: comma-separated intent labels, or None to skip routing.

        Returns:
            tuple: (synthesized audio ndarray, top intent label or None).
        """
        self.intent_label = intents.split(",") if intents is not None else None
        self.translate(speechfile)
        # BUGFIX: default to None so the return below does not raise
        # UnboundLocalError when no intent labels were supplied.
        intent = None
        if self.intent_label is not None:
            self.text, intent = self.get_intent()
        return self.text_to_audio(), intent
# (end of file; a stray "|" extraction artifact was removed here)