# Import NeMo and its ASR, NLP and TTS collections
import nemo
# Import Speech Recognition collection
import nemo.collections.asr as nemo_asr
# Import Natural Language Processing collection
import nemo.collections.nlp as nemo_nlp
# Import Speech Synthesis collection
import nemo.collections.tts as nemo_tts
from nemo.collections.nlp.models.dialogue.dialogue_zero_shot_intent_model import DialogueZeroShotIntentModel
import whisper
from .utils import measure_time


class SpeechTranslate:
  @measure_time
  def __init__(self, intents=None):
    # Instantiate all the necessary models directly from NVIDIA NGC
    self.intent_label = intents
    # Zero-shot intent classification model (BERT base, uncased)
    self.intent_model = DialogueZeroShotIntentModel.from_pretrained("zeroshotintent_en_bert_base_uncased").eval()
    # Speech Recognition model - OpenAI Whisper, which also detects the spoken language
    self.transcription = whisper.load_model("base")
    # Neural Machine Translation models (German -> English and English -> German)
    self.nmt_model = nemo_nlp.models.MTEncDecModel.from_pretrained(model_name='nmt_de_en_transformer24x6').eval()
    self.nmt_model_de = nemo_nlp.models.MTEncDecModel.from_pretrained(model_name='nmt_en_de_transformer24x6').eval()
    # Spectrogram generator which takes text as an input and produces a spectrogram
    self.spectrogram_generator = nemo_tts.models.FastPitchModel.from_pretrained(model_name="tts_de_fastpitch_singlespeaker").eval()
    # Vocoder model which takes a spectrogram and produces the actual audio
    self.vocoder = nemo_tts.models.HifiGanModel.from_pretrained(model_name="tts_de_slr_hifigan_ft_fastpitch_singlespeaker").eval()

  @measure_time
  def translate(self, speechfile):
    # Transcribe the audio file; Whisper resamples the input to mono 16 kHz
    # internally and also returns the detected language
    text = self.transcription.transcribe(speechfile)
    # German transcripts are translated to English; English passes through unchanged
    if text["language"] == "de":
      english_text = self.nmt_model.translate([text["text"]])
    elif text["language"] == "en":
      english_text = [text["text"]]
    else:
      raise NotImplementedError(f"Language: {text['language']} currently not supported")
    if self.intent_label is None:
      # No intent routing requested: translate straight back to German for TTS
      self.text = self.nmt_model_de.translate(english_text)
    else:
      # Keep the English text so the zero-shot intent model can classify it
      self.text = english_text

  @measure_time
  def get_intent(self):
    # Classify the English transcript against the user-supplied candidate labels
    intents = self.intent_model.predict([self.text[0]], self.intent_label)
    intent = [f"This is a {intents[0]['labels'][0]}, I will route you to the corresponding department"]
    print(intents)
    # Translate the routing message to German so it can be synthesized
    intent_de = self.nmt_model_de.translate(intent)
    return intent_de, intents[0]['labels'][0]
  
  @measure_time
  def text_to_audio(self):
    # A helper function which combines FastPitch and HiFiGAN to go directly
    # from text to audio
    parsed = self.spectrogram_generator.parse(self.text[0])
    spectrogram = self.spectrogram_generator.generate_spectrogram(tokens=parsed)
    audio = self.vocoder.convert_spectrogram_to_audio(spec=spectrogram)
    return audio.to('cpu').detach().numpy()

  @measure_time
  def process(self, speechfile, intents):
    # Full pipeline: transcribe -> translate -> optional intent routing -> TTS
    self.intent_label = intents.split(",") if intents is not None else None
    intent = None
    self.translate(speechfile)
    if self.intent_label is not None:
      self.text, intent = self.get_intent()
    return self.text_to_audio(), intent
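

# A minimal usage sketch, not part of the original module (assumptions: an audio
# file "sample.wav" exists, the `soundfile` package is installed for writing the
# result, and the 22050 Hz output rate matches the NeMo FastPitch/HiFiGAN
# checkpoints loaded above). Because this module uses a relative import, run it
# as part of its package, e.g. `python -m <package>.<module>`.
if __name__ == "__main__":
  import soundfile as sf  # assumption: only needed for this example

  st = SpeechTranslate()
  # Pass intents=None to skip routing and just speak the German translation
  audio, intent = st.process("sample.wav", intents="billing,support")
  print("Predicted intent:", intent)
  sf.write("reply.wav", audio[0], samplerate=22050)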