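"""Gradio demo for UniverSLU: Universal Spoken Language Understanding for
Diverse Tasks with Natural Language Instructions. A single Whisper-based
ESPnet model handles every task; the task is selected by a text prompt."""
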
import gradio as gr
import soundfile
import torch
from espnet2.bin.asr_inference import Speech2Text

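# All three recognizers share one UniverSLU-17 Task Specifier checkpoint and
# differ only in the natural-language task prompt. This instance is prompted
# for entity recognition (<|ner|>) on the English SLURP dataset.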
speech2text_slurp = Speech2Text.from_pretrained(
    asr_train_config="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/config.yaml",
    asr_model_file="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/valid.acc.ave_10best.pth",
    # Decoding parameters are not included in the model file
    lang_prompt_token="<|en|> <|ner|> <|SLURP|>",
    prompt_token_file="UniverSLU-17-Task-Specifier/add_tokens-Copy1.txt",
    nbest=1
)

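# Same checkpoint, prompted for intent classification (<|ic|>) on the English
# Fluent Speech Commands (FSC) dataset.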
speech2text_fsc = Speech2Text.from_pretrained(
    asr_train_config="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/config.yaml",
    asr_model_file="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/valid.acc.ave_10best.pth",
    # Decoding parameters are not included in the model file
    lang_prompt_token="<|en|> <|ic|> <|fsc|>",
    prompt_token_file="UniverSLU-17-Task-Specifier/add_tokens-Copy1.txt",
    nbest=1
)

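# Same checkpoint, prompted for speech command recognition (<|scr|>) on the
# Dutch Grabo dataset.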
speech2text_grabo = Speech2Text.from_pretrained(
    asr_train_config="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/config.yaml",
    asr_model_file="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/valid.acc.ave_10best.pth",
    # Decoding parameters are not included in the model file
    lang_prompt_token="<|nl|> <|scr|> <|grabo_scr|>",
    prompt_token_file="UniverSLU-17-Task-Specifier/add_tokens-Copy1.txt",
    nbest=1
)

def inference(wav, data):
    """Run the UniverSLU model matching the selected task on the input audio."""
    speech, rate = soundfile.read(wav.name)
    # Downmix stereo recordings to mono; the models expect a 1-D waveform.
    if len(speech.shape) == 2:
        speech = speech[:, 0]
    with torch.no_grad():
        if data == "english_slurp":
            nbests = speech2text_slurp(speech)
        elif data == "english_fsc":
            nbests = speech2text_fsc(speech)
        elif data == "dutch":
            nbests = speech2text_grabo(speech)
        else:
            raise ValueError(f"Unknown task: {data}")
    # Keep only the text of the best hypothesis.
    text, *_ = nbests[0]
    return text

title = "UniverSLU"
description = "Gradio demo for UniverSLU: Universal Spoken Language Understanding for Diverse Tasks with Natural Language Instructions. To use it, simply record your audio or click one of the examples to load it. Read more at the link below."
article = "<p style='text-align: center'><a href='https://github.com/espnet/espnet' target='_blank'>Github Repo</a></p>"

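# Bundled example clips, one per task.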
examples = [
    ["audio_slurp.flac", "english_slurp"],
    ["audio_fsc.wav", "english_fsc"],
    ["audio_grabo.wav", "dutch"],
]

gr.Interface(
    inference,
    [
        gr.inputs.Audio(label="input audio", source="microphone", type="file"),
        # Choices must match the task names checked in inference().
        gr.inputs.Radio(
            choices=["english_slurp", "english_fsc", "dutch"],
            type="value",
            default="english_fsc",
            label="Task",
        ),
    ],
    gr.outputs.Textbox(type="str", label="Output"),
    title=title,
    description=description,
    article=article,
    enable_queue=True,
    examples=examples,
).launch(debug=True)