# imports
import os
import sys
os.system("pip install git+https://github.com/openai/whisper.git")
import gradio as gr
import whisper
# The Whisper checkpoint used for ASR; options include tiny, base, small, medium,
# large, and large-v2 (large and large-v2 don't fit on the Hugging Face CPU tier)
model = whisper.load_model("medium")
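# A possible tweak (not in the original app): pick the checkpoint from an
# environment variable so a CPU-only Space can fall back to a smaller model,
# e.g. WHISPER_MODEL=small. Shown commented out to keep the behavior above.
# model = whisper.load_model(os.environ.get("WHISPER_MODEL", "medium"))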
import torch
# A table to look up all the languages
language_id_lookup = {
"Arabic" : "ar",
"English" : "en",
"Chinese" : "zh",
"Spanish" : "es",
"Russian" : "ru",
"French" : "fr",
}
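# Illustrative sanity check (not in the original): every code in the table should
# be one Whisper understands; whisper.tokenizer.LANGUAGES maps code -> name.
from whisper.tokenizer import LANGUAGES
assert all(code in LANGUAGES for code in language_id_lookup.values()), \
    "language_id_lookup contains a code Whisper does not recognize"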
# load mRASP2
os.system("git clone https://github.com/PANXiao1994/mRASP2.git")
os.system('mv -n mRASP2/* ./')
os.system("rm -rf mRASP2")
os.system("pip install -r requirements.txt")
os.system("git clone https://github.com/pytorch/fairseq")
os.system("cd fairseq; pip install ./; cd ..")
model_name = "12e12d_last.pt"
# os.system("wget https://lf3-nlp-opensource.bytetos.com/obj/nlp-opensource/acl2021/mrasp2/" + model_name)
os.system("wget https://lf3-nlp-opensource.bytetos.com/obj/nlp-opensource/acl2021/mrasp2/bpe_vocab")
os.system("wget https://lf3-nlp-opensource.bytetos.com/obj/nlp-opensource/emnlp2020/mrasp/pretrain/dataset/codes.bpe.32000")
# load tts (IMS-Toucan pipeline, currently disabled in favor of Azure TTS below)
# os.system("git clone https://github.com/Kyubyong/g2pC.git")
# os.system("cd g2pC; sed -i 's/pkuseg/spacy_pkuseg/g' setup.py; \
# sed -i 's/import pkuseg/import spacy_pkuseg as pkuseg/g' g2pc/g2pc.py; \
# sed -i 's/package_data={/# package_data={/g' setup.py; \
# pip install ./; cd ..")
# os.system("git clone https://github.com/DigitalPhonetics/IMS-Toucan.git")
# sys.path.append('./IMS-Toucan')
# os.system("cd IMS-Toucan; pip install --no-cache-dir -r requirements.txt")
# os.system("python run_model_downloader.py; cd ..")
# from InferenceInterfaces.PortaSpeechInterface import PortaSpeechInterface
# cwd = os.getcwd()
# os.chdir('./IMS-Toucan')
# tts = PortaSpeechInterface(device='cpu', tts_model_path='Meta')
# os.chdir(cwd)
# azure tts (replaces the disabled IMS-Toucan pipeline above)
os.system("pip install azure-cognitiveservices-speech")
lang2voice = {
"zh": "zh-CN-XiaoxiaoNeural",
"ar": "ar-EG-SalmaNeural",
"en": "en-US-JennyNeural",
"es": "es-ES-AbrilNeural",
"ru": "ru-RU-DariyaNeural",
"fr": "fr-FR-AlainNeural",
}
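# Defensive variant (illustrative, not used below): fall back to the English voice
# if a language code is ever missing from the table.
# voice = lang2voice.get(tgt_language, "en-US-JennyNeural")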
# The predict function. audio, src_language, tgt_language, and mic_audio are all
# passed in directly by Gradio, i.e. they are user-provided. They are declared in
# the inputs=[] block at the bottom; the outputs=[] block declares the output types.
def predict(audio, src_language, tgt_language, mic_audio=None):
    # Prefer microphone audio when provided; otherwise fall back to the uploaded file
    if mic_audio is not None:
        input_audio = mic_audio
    elif audio is not None:
        input_audio = audio
    else:
        # The interface declares three outputs, so return a placeholder for each
        return "(please provide audio)", "", None
    # Use Whisper's own preprocessing helpers on the input audio
    audio = whisper.load_audio(input_audio)
    audio = whisper.pad_or_trim(audio)
    # Compute the log-mel spectrogram
    mel = whisper.log_mel_spectrogram(audio).to(model.device)
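    # Note: pad_or_trim fixes the input to Whisper's 30-second context window, so
    # only the first 30 seconds of a longer recording are transcribed.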
    # If the model is supposed to detect the language, set src_language to None;
    # otherwise map the dropdown labels to their language codes
    if src_language == "Detect Language":
        src_language = None
    else:
        src_language = language_id_lookup[src_language.split()[0]]
tgt_language = language_id_lookup[tgt_language.split()[0]]
    # Run the mel spectrogram through Whisper to get a DecodingResult, whose fields include:
    # audio_features (Tensor), language, language_probs, tokens, text, avg_logprob, no_speech_prob, temperature, compression_ratio
    # asr
    # fp16 decoding is only supported on GPU; fall back to fp32 on CPU
    options = whisper.DecodingOptions(fp16 = torch.cuda.is_available(), language = src_language)
    result = whisper.decode(model, mel, options)
if src_language is None:
src_language = result.language
transcript = result.text
# mt
with open("input." + src_language, 'w') as w:
w.write(result.text)
with open("input." + tgt_language, 'w') as w:
w.write('LANG_TOK_' + src_language.upper())
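    # For example (illustrative), translating English -> French the two files are:
    #   input.en : "hello world"    (the Whisper transcript)
    #   input.fr : "LANG_TOK_EN"    (a placeholder; the real target prefix is
    #                                passed via --lang-prefix-tok below)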
os.system("python fairseq/fairseq_cli/preprocess.py --dataset-impl raw \
--srcdict bpe_vocab --tgtdict bpe_vocab --testpref input -s {} -t {}".format( \
src_language, tgt_language))
os.system("python fairseq/fairseq_cli/interactive.py ./data-bin \
--user-dir mcolt \
-s zh \
-t en \
--skip-invalid-size-inputs-valid-test \
--path {} \
--max-tokens 1024 \
--task translation_w_langtok \
--lang-prefix-tok \"LANG_TOK_{}\" \
--max-source-positions 1024 \
--max-target-positions 1024 \
--nbest 1 \
--bpe subword_nmt \
--bpe-codes codes.bpe.32000 \
--post-process --tokenizer moses \
--input input.{} | grep -E '[D]-[0-9]+' > output".format(
model_name, tgt_language.upper(), src_language))
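    # fairseq-interactive prints hypotheses as lines like
    #   D-0 -0.42 LANG_TOK_FR bonjour le monde    (score and text are illustrative)
    # The grep keeps only those D-lines; the parsing below drops the first three
    # space-separated fields (id, score, language token).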
with open("output", 'r') as r:
translation = (' '.join(r.readline().split(' ')[3:])).strip()
# tts
# tts.set_language(tgt_language)
# tts.read_to_file(text_list=[translation], file_location='output.wav')
# azure tts
import azure.cognitiveservices.speech as speechsdk
    # NOTE: hardcoded for this demo; in practice use a Space secret / environment variable
    speech_key = "2d1847f4151f4f94ae06d0b620533936"
    service_region = "eastus"
speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)
# Note: the voice setting will not overwrite the voice element in input SSML.
speech_config.speech_synthesis_voice_name = lang2voice[tgt_language]
audio_config = speechsdk.audio.AudioOutputConfig(filename="output.wav")
speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=audio_config)
speech_synthesizer.speak_text(translation)
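    # Illustrative alternative (not in the original): speak_text returns a
    # SpeechSynthesisResult, so the call above could instead capture it and
    # surface failures rather than handing Gradio an empty output.wav.
    # result = speech_synthesizer.speak_text(translation)
    # if result.reason != speechsdk.ResultReason.SynthesizingAudioCompleted:
    #     print("TTS failed:", result.reason, file=sys.stderr)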
    # Return the transcript, the translation, and the synthesized speech file
return transcript, translation, "output.wav"
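# Example invocation outside Gradio (illustrative; assumes a local sample.wav exists):
# transcript, translation, wav_path = predict("sample.wav", "English", "French")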
title = "Demo for Whisper (ASR) -> Something -> IMS Toucan (TTS)"
description = """
<b>How to use:</b> Upload an audio file or record using the microphone. The audio is into the whisper model developed by openai.
The output is the text transcription of the audio in the language you inputted. If you asked the model to detect a language, it will
tell you what language it detected.
"""
# The gradio interface
gr.Interface(
fn=predict,
inputs=[
gr.Audio(label="Upload Speech", source="upload", type="filepath"),
        gr.Dropdown(['Arabic',
                     'Chinese',
                     'English',
                     'Spanish',
                     'Russian',
                     'French',
                     'Detect Language'], value='English', label="Select the language of input"),
        # 'Detect Language' is deliberately absent here: the output language must be known
        gr.Dropdown(['Arabic',
                     'Chinese',
                     'English',
                     'Spanish',
                     'Russian',
                     'French'], value='English', label="Select the language of output"),
gr.Audio(label="Record Speech", source="microphone", type="filepath"),
],
outputs=[
gr.Text(label="Transcript"),
gr.Text(label="Translation"),
gr.Audio(label="Translation Speech")
],
title=title,
description=description,
).launch()