# NOTE(review): removed paste-site artifacts ("Spaces:", "Runtime error" x2)
# that preceded the code and were not valid Python.
# Third-party: OpenAI Whisper for transcription and language identification.
import whisper

# NVIDIA NeMo and its ASR, NLP and TTS collections.
import nemo
# Speech Recognition collection
import nemo.collections.asr as nemo_asr
# Natural Language Processing collection
import nemo.collections.nlp as nemo_nlp
# Speech Synthesis collection
import nemo.collections.tts as nemo_tts
from nemo.collections.nlp.models.dialogue.dialogue_zero_shot_intent_model import DialogueZeroShotIntentModel

# Local helpers.
from .utils import measure_time
class SpeechTranslate(): | |
def __init__(self,intents=None): | |
# Next, we instantiate all the necessary models directly from NVIDIA NGC | |
# Speech Recognition model - QuartzNet trained on Russian part of MCV 6.0 | |
self.intent_label= intents | |
self.intent_model=DialogueZeroShotIntentModel.from_pretrained("zeroshotintent_en_bert_base_uncased").eval() | |
self.transcription= whisper.load_model("base") | |
# Neural Machine Translation model | |
self.nmt_model = nemo_nlp.models.MTEncDecModel.from_pretrained(model_name='nmt_de_en_transformer24x6').eval() | |
self.nmt_model_de = nemo_nlp.models.MTEncDecModel.from_pretrained(model_name='nmt_en_de_transformer24x6').eval() | |
# Spectrogram generator which takes text as an input and produces spectrogram | |
self.spectrogram_generator = nemo_tts.models.FastPitchModel.from_pretrained(model_name="tts_de_fastpitch_singlespeaker").eval() | |
# Vocoder model which takes spectrogram and produces actual audio | |
self.vocoder = nemo_tts.models.HifiGanModel.from_pretrained(model_name="tts_de_slr_hifigan_ft_fastpitch_singlespeaker").eval() | |
def translate(self,speechfile): | |
# Transcribe an audio file | |
# IMPORTANT: The audio must be mono with 16Khz sampling rate | |
text = self.transcription.transcribe(speechfile) | |
# You should see russian text here. Let's translate it to English | |
if text["language"]=="de": | |
english_text = self.nmt_model.translate([text["text"]]) | |
elif text["language"]=="en": | |
english_text=text["text"] | |
else: | |
raise NotImplementedError(f"Language: {text['language']} currently not supported") | |
if self.intent_label is None: | |
self.text = self.nmt_model_de.translate(english_text) | |
else: | |
self.text=english_text | |
# After this you should see English translation | |
# Let's convert it into audio | |
# A helper function which combines FastPitch and HiFiGAN to go directly from | |
# text to audio | |
def get_intent(self): | |
intents = self.intent_model.predict([self.text[0]],self.intent_label) | |
intent = [f"This is a {intents[0]['labels'][0]}, I will route you to the corresponding department"] | |
print(intents) | |
intenti = self.nmt_model_de.translate(intent) | |
return intenti,intents[0]['labels'][0] | |
def text_to_audio(self): | |
parsed = self.spectrogram_generator.parse(self.text[0]) | |
spectrogram = self.spectrogram_generator.generate_spectrogram(tokens=parsed) | |
audio = self.vocoder.convert_spectrogram_to_audio(spec=spectrogram) | |
return audio.to('cpu').detach().numpy() | |
def process(self,speechfile,intents): | |
self.intent_label = intents.split(",") if intents is not None else None | |
self.translate(speechfile) | |
if self.intent_label is not None: | |
self.text,intent = self.get_intent() | |
return self.text_to_audio(),intent | |