import gradio as gr
import re
import os
import torch

# Speech-to-text
import whisper

# QA
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline

# TTS
import tempfile
from TTS.utils.manage import ModelManager
from TTS.utils.synthesizer import Synthesizer
from typing import Optional
device = "cuda" if torch.cuda.is_available() else "cpu"

# Whisper: speech-to-text. The base model is used for language detection,
# the medium model for transcription/translation.
model = whisper.load_model("base", device=device)
#model_med = whisper.load_model("small", device=device)
model_med = whisper.load_model("medium", device=device)
# RoBERTa Q&A
model_name = "deepset/tinyroberta-squad2"
#model_name = "deepset/roberta-base-squad2"
nlp = pipeline('question-answering', model=model_name, tokenizer=model_name, device=0 if device == "cuda" else -1)
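# The question-answering pipeline takes {'question': ..., 'context': ...} and returns a
# dict of the form {'score': ..., 'start': ..., 'end': ..., 'answer': ...};
# only the 'answer' field is used below (see get_query_result).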
# TTS
tts_manager = ModelManager()
MAX_TXT_LEN = 100

print(model.device)
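# Other Coqui models should be discoverable via tts_manager.list_models(), which returns
# identifiers like "tts_models/en/ljspeech/tacotron2-DDC_ph" (the voice used below).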
# Whisper - speech-to-text
def whisper_stt(audio):
    # load audio and pad/trim it to fit 30 seconds
    audio = whisper.load_audio(audio)
    audio = whisper.pad_or_trim(audio)
    # make log-Mel spectrogram and move to the same device as the model
    mel = whisper.log_mel_spectrogram(audio).to(model.device)
    # detect the spoken language
    _, probs = model.detect_language(mel)
    lang = max(probs, key=probs.get)
    print(f"Detected language: {lang}")
    # decode the audio: transcribe in the detected language, and translate to English
    options_transc = whisper.DecodingOptions(fp16=False, language=lang, task='transcribe')
    options_transl = whisper.DecodingOptions(fp16=False, language='en', task='translate')
    result_transc = whisper.decode(model_med, mel, options_transc)
    result_transl = whisper.decode(model_med, mel, options_transl)
    # print the recognized text
    print(f"transcript is: {result_transc.text}")
    print(f"translation is: {result_transl.text}")
    return result_transc.text, result_transl.text, lang
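
# Quick sanity check (a sketch; "query.wav" is a hypothetical local recording):
#   transcript, translation, lang = whisper_stt("query.wav")
#   print(transcript, translation, lang)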

# Coqui - text-to-speech
def tts(text: str, model_name: str):
    if len(text) > MAX_TXT_LEN:
        text = text[:MAX_TXT_LEN]
        print(f"Input text was cut off since it went over the {MAX_TXT_LEN} character limit.")
    print(text, model_name)
    # download model
    model_path, config_path, model_item = tts_manager.download_model(f"tts_models/{model_name}")
    vocoder_name: Optional[str] = model_item["default_vocoder"]
    # download vocoder (if the model ships with a default one)
    vocoder_path = None
    vocoder_config_path = None
    if vocoder_name is not None:
        vocoder_path, vocoder_config_path, _ = tts_manager.download_model(vocoder_name)
    # init synthesizer
    synthesizer = Synthesizer(
        model_path, config_path, None, None, vocoder_path, vocoder_config_path,
    )
    if synthesizer is None:
        raise NameError("model not found")
    # synthesize and write the waveform to a temporary .wav file
    wavs = synthesizer.tts(text)
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
        synthesizer.save_wav(wavs, fp)
    return fp.name
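
# Example call (a sketch; the model name matches what engine() passes below):
#   wav_path = tts("The answer is forty-two.", "en/ljspeech/tacotron2-DDC_ph")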

def engine(audio, context):
    # voice query -> text (transcript + English translation)
    transcribe, translation, lang = whisper_stt(audio)
    # extractive QA over the user-supplied context
    answer = get_query_result(translation, context)
    # answer text -> speech
    answer_speech = tts(answer, model_name='en/ljspeech/tacotron2-DDC_ph')
    return translation, answer, answer_speech

def get_query_result(query, context):
    QA_input = {
        'question': query,
        'context': context
    }
    answer = nlp(QA_input)['answer']
    return answer
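
# Example (a sketch with made-up context; expected answer: 'William Shakespeare'):
#   get_query_result("Who wrote Hamlet?",
#                    "Hamlet is a tragedy written by William Shakespeare.")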

demo = gr.Blocks()
with demo:
    gr.Markdown("<h1><center>Voice to QA</center></h1>")
    gr.Markdown(
        """<center>An app to ask voice queries about a text article.</center>"""
    )
    gr.Markdown(
        """Model pipeline consisting of -<br>- [**Whisper**](https://github.com/openai/whisper) for Speech-to-Text,<br>- [**Tiny RoBERTa QA**](https://huggingface.co/deepset/tinyroberta-squad2) for Question Answering, and<br>- [**Coqui TTS**](https://github.com/coqui-ai/TTS) for Text-to-Speech.<br>Just type/paste your text in the context field, and then ask voice questions.""")
    with gr.Column():
        with gr.Row():
            with gr.Column():
                in_audio = gr.Audio(source="microphone", type="filepath", label='Record your voice query here in English, Spanish or French for best results')
                in_context = gr.Textbox(label="Context")
                b1 = gr.Button("Generate Answer")
            with gr.Column():
                out_query = gr.Textbox(label='Your Query (Transcribed)')
                out_audio = gr.Audio(label='Voice response')
                out_textbox = gr.Textbox(label="Answer")

    b1.click(engine, inputs=[in_audio, in_context], outputs=[out_query, out_textbox, out_audio])

demo.launch(enable_queue=True, debug=True)