import gradio as gr
import soundfile
import time
import torch
import scipy.io.wavfile
from espnet2.bin.tts_inference import Text2Speech
from espnet2.utils.types import str_or_none
from espnet2.bin.asr_inference import Speech2Text
# tagen = 'kan-bayashi/ljspeech_vits'
# vocoder_tagen = "none"
speech2text = Speech2Text.from_pretrained(
    asr_train_config="slurp/config.yaml",
    asr_model_file="slurp/valid.acc.ave_10best.pth",
    # Decoding parameters are not included in the model file
    nbest=1,
)
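# Since decoding parameters are not stored in the model file, they can be passed to
# from_pretrained() as keyword arguments. The commented sketch below shows commonly
# used ESPnet2 ASR decoding options with illustrative values (an assumption, not the
# settings this Space was tuned with):
# speech2text = Speech2Text.from_pretrained(
#     asr_train_config="slurp/config.yaml",
#     asr_model_file="slurp/valid.acc.ave_10best.pth",
#     beam_size=10,      # beam width for beam-search decoding
#     ctc_weight=0.3,    # weight of CTC scores in joint CTC/attention decoding
#     nbest=1,           # number of hypotheses to return
# )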
# Confirm the sampling rate is equal to that of the training corpus.
# If not, you need to resample the audio data before inputting to speech2text
# speech, rate = soundfile.read("audio--1504190171-headset.flac")
# nbests = speech2text(speech)
# text, *_ = nbests[0]
# print(text)
# exit()
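# Minimal resampling sketch using librosa, in case the input audio does not match the
# training sampling rate. The 16 kHz target is an assumption about the SLURP training
# data, not a value read from the config:
# import librosa
# speech, rate = soundfile.read("audio--1504190171-headset.flac")
# if rate != 16000:
#     speech = librosa.resample(speech, orig_sr=rate, target_sr=16000)
#     rate = 16000
# nbests = speech2text(speech)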
# text2speechen = Text2Speech.from_pretrained(
# model_tag=str_or_none(tagen),
# vocoder_tag=str_or_none(vocoder_tagen),
# device="cpu",
# # Only for Tacotron 2 & Transformer
# threshold=0.5,
# # Only for Tacotron 2
# minlenratio=0.0,
# maxlenratio=10.0,
# use_att_constraint=False,
# backward_window=1,
# forward_window=3,
# # Only for FastSpeech & FastSpeech2 & VITS
# speed_control_alpha=1.0,
# # Only for VITS
# noise_scale=0.333,
# noise_scale_dur=0.333,
# )
# tagjp = 'kan-bayashi/jsut_full_band_vits_prosody'
# vocoder_tagjp = 'none'
# text2speechjp = Text2Speech.from_pretrained(
# model_tag=str_or_none(tagjp),
# vocoder_tag=str_or_none(vocoder_tagjp),
# device="cpu",
# # Only for Tacotron 2 & Transformer
# threshold=0.5,
# # Only for Tacotron 2
# minlenratio=0.0,
# maxlenratio=10.0,
# use_att_constraint=False,
# backward_window=1,
# forward_window=3,
# # Only for FastSpeech & FastSpeech2 & VITS
# speed_control_alpha=1.0,
# # Only for VITS
# noise_scale=0.333,
# noise_scale_dur=0.333,
# )
# tagch = 'kan-bayashi/csmsc_full_band_vits'
# vocoder_tagch = "none"
# text2speechch = Text2Speech.from_pretrained(
# model_tag=str_or_none(tagch),
# vocoder_tag=str_or_none(vocoder_tagch),
# device="cpu",
# # Only for Tacotron 2 & Transformer
# threshold=0.5,
# # Only for Tacotron 2
# minlenratio=0.0,
# maxlenratio=10.0,
# use_att_constraint=False,
# backward_window=1,
# forward_window=3,
# # Only for FastSpeech & FastSpeech2 & VITS
# speed_control_alpha=1.0,
# # Only for VITS
# noise_scale=0.333,
# noise_scale_dur=0.333,
# )
def inference(wav, lang):
    with torch.no_grad():
        if lang == "english":
            speech, rate = soundfile.read(wav.name)
            nbests = speech2text(speech)
            text, *_ = nbests[0]
            # The SLU model prepends the predicted intent, formatted as
            # "scenario_action", to the transcript, so the first token of the
            # hypothesis carries the intent label.
            intent = text.split(" ")[0]
            scenario = intent.split("_")[0]
            action = intent.split("_")[1]
            text = "{scenario: " + scenario + ", action: " + action + "}"
        # if lang == "chinese":
        #     wav = text2speechch(text)["wav"]
        #     scipy.io.wavfile.write("out.wav", text2speechch.fs, wav.view(-1).cpu().numpy())
        # if lang == "japanese":
        #     wav = text2speechjp(text)["wav"]
        #     scipy.io.wavfile.write("out.wav", text2speechjp.fs, wav.view(-1).cpu().numpy())
    return text
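# Illustrative example (hypothetical, not produced by running this Space): an utterance
# decoded as "alarm_set wake me up at nine am" would be returned as
# "{scenario: alarm, action: set}".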
title = "ESPnet2-SLU"
description = "Gradio demo for ESPnet2-SLU: Advancing Spoken Language Understanding through ESPnet. To use it, simply record your audio. Read more at the links below."
article = "<p style='text-align: center'><a href='https://github.com/espnet/espnet' target='_blank'>Github Repo</a></p>"
examples=[['audio_slurp.flac',"english"]]
# gr.inputs.Textbox(label="input text",lines=10),gr.inputs.Radio(choices=["english"], type="value", default="english", label="language")
gr.Interface(
    inference,
    [
        gr.inputs.Audio(label="input audio", source="microphone", type="file"),
        gr.inputs.Radio(choices=["english"], type="value", default="english", label="language"),
    ],
    gr.outputs.Textbox(type="str", label="Output"),
    title=title,
    description=description,
    article=article,
    enable_queue=True,
    examples=examples,
).launch(debug=True)