Voice-QA / app.py
jayesh95's picture
reformatted and removed comments
9e8179c
raw
history blame
4.58 kB
import gradio as gr
import re
import os
import torch
#Speech to text
import whisper
#QA
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline
#TTS
import tempfile
from TTS.utils.manage import ModelManager
from TTS.utils.synthesizer import Synthesizer
from typing import Optional
device = "cuda" if torch.cuda.is_available() else "cpu"
# Whisper: Speech-to-text
model = whisper.load_model("base", device = device)
#model_med = whisper.load_model("small", device = device)
model_med = whisper.load_model("medium", device = device)
#Roberta Q&A
model_name = "deepset/tinyroberta-squad2"
#model_name = "deepset/roberta-base-squad2"
nlp = pipeline('question-answering', model=model_name, tokenizer=model_name, device = 0 if device == "cuda" else -1)
#TTS
tts_manager = ModelManager()
MAX_TXT_LEN = 100
print(model.device)
# Whisper - speech-to-text
def whisper_stt(audio):
# load audio and pad/trim it to fit 30 seconds
audio = whisper.load_audio(audio)
audio = whisper.pad_or_trim(audio)
# make log-Mel spectrogram and move to the same device as the model
mel = whisper.log_mel_spectrogram(audio).to(model.device)
# detect the spoken language
_, probs = model.detect_language(mel)
lang = max(probs, key=probs.get)
print(f"Detected language: {max(probs, key=probs.get)}")
# decode the audio
options_transc = whisper.DecodingOptions(fp16 = False, language=lang, task='transcribe') #lang
options_transl = whisper.DecodingOptions(fp16 = False, language='en', task='translate') #lang
result_transc = whisper.decode(model_med, mel, options_transc)
result_transl = whisper.decode(model_med, mel, options_transl)
# print the recognized text
print(f"transcript is : {result_transc.text}")
print(f"translation is : {result_transl.text}")
return result_transc.text, result_transl.text, lang
# Coqui - Text-to-Speech
def tts(text: str, model_name: str):
if len(text) > MAX_TXT_LEN:
text = text[:MAX_TXT_LEN]
print(f"Input text was cutoff since it went over the {MAX_TXT_LEN} character limit.")
print(text, model_name)
# download model
model_path, config_path, model_item = tts_manager.download_model(f"tts_models/{model_name}")
vocoder_name: Optional[str] = model_item["default_vocoder"]
# download vocoder
vocoder_path = None
vocoder_config_path = None
if vocoder_name is not None:
vocoder_path, vocoder_config_path, _ = tts_manager.download_model(vocoder_name)
# init synthesizer
synthesizer = Synthesizer(
model_path, config_path, None, None, vocoder_path, vocoder_config_path,
)
# synthesize
if synthesizer is None:
raise NameError("model not found")
wavs = synthesizer.tts(text)
# return output
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
synthesizer.save_wav(wavs, fp)
return fp.name
def engine(audio, context):
# Get voice query to text
transcribe, translation, lang = whisper_stt(audio)
# Get Query answer
answer = get_query_result(translation, context)
answer_speech = tts(answer, model_name= 'en/ljspeech/tacotron2-DDC_ph')
return translation, answer, answer_speech
def get_query_result(query, context):
QA_input = {
'question': query,
'context': context
}
answer = nlp(QA_input)['answer']
return answer
demo = gr.Blocks()
with demo:
gr.Markdown("<h1><center>Voice to QA</center></h1>")
gr.Markdown(
"""<center> An app to ask voice queries about a text article.</center>
"""
)
gr.Markdown(
"""Model pipeline consisting of - <br>- [**Whisper**](https://github.com/openai/whisper)for Speech-to-text, <br>- [**Tiny Roberta QA**](https://huggingface.co/deepset/tinyroberta-squad2) for Question Answering, and <br>- [**CoquiTTS**](https://github.com/coqui-ai/TTS) for Text-To-Speech.
<br> Just type/paste your text in the context field, and then ask voice questions.""")
with gr.Column():
with gr.Row():
with gr.Column():
in_audio = gr.Audio(source="microphone", type="filepath", label='Record your voice query here in English, Spanish or French for best results-')
in_context = gr.Textbox(label="Context")
b1 = gr.Button("Generate Answer")
with gr.Column():
out_query = gr.Textbox('Your Query (Transcribed)')
out_audio = gr.Audio(label = 'Voice response')
out_textbox = gr.Textbox(label="Answer")
b1.click(engine, inputs=[in_audio, in_context], outputs=[out_query, out_textbox, out_audio])
demo.launch(enable_queue=True, debug=True)