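"""Gradio voice-chatbot demo.

Pipeline: microphone audio -> Groq Whisper transcription -> Llama 3 chat
completion -> Deepgram text-to-speech, wired into a gr.Blocks UI.

Requires GROQ_API_KEY and DEEPGRAM_API_KEY in the environment (or a .env
file). Package names assume the usual PyPI distributions: gradio, groq,
deepgram-sdk, soundfile, python-dotenv.
"""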
import io
import os
import gradio as gr
from groq import Groq
import soundfile as sf
from dotenv import load_dotenv
from gradio import ChatMessage
from deepgram import DeepgramClient, SpeakOptions

# Load API keys from a local .env file if one exists; when deployed,
# the variables are expected to already be set in the environment.
load_dotenv()

def get_transcript(audio):
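    """Transcribe a microphone recording with Groq's Whisper endpoint.

    `audio` is the (sample_rate, numpy_data) tuple produced by
    gr.Audio(type="numpy").
    """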
    # Encode the (sample_rate, data) tuple as an in-memory MP3 so it can be
    # uploaded without touching disk.
    audio_buffer = io.BytesIO()
    sf.write(audio_buffer, audio[1], samplerate=audio[0], format="MP3")
    audio_buffer.seek(0)

    # The Groq client reads GROQ_API_KEY from the environment.
    client = Groq()
    transcription = client.audio.transcriptions.create(
        file=("audio.mp3", audio_buffer.read()),
        model="whisper-large-v3-turbo",
        response_format="json",
        temperature=0.0,
    )
    return transcription.text

def generate_response(chat_history: list[ChatMessage]):
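    """Generate the assistant's next reply with a Groq-hosted Llama 3 model."""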
    # Groq client
    client = Groq()
    messages = [
        {
            "role": "system",
            "content": (
                "You are an assistant working in a helpline center. "
                "Answer queries in short and concise sentences. "
                "Keep in mind that the output will be converted to voice, "
                "so use appropriate vocabulary."
            ),
        }
    ]
    messages.extend(
        {"role": message["role"], "content": message["content"]}
        for message in chat_history
    )
    response = client.chat.completions.create(
        model="llama3-8b-8192",
        messages=messages,
    )
    return response.choices[0].message.content

def speech_synthesis(text: str):
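    """Synthesize `text` to speech with Deepgram's Aura voice.

    Returns the path of the saved MP3, or None on failure.
    """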
    api_key = os.getenv("DEEPGRAM_API_KEY")
    payload = {"text": text}
    filename = "audio.mp3"

    try:
        deepgram = DeepgramClient(api_key)
        options = SpeakOptions(
            model="aura-luna-en",
        )
        # Save the synthesized reply to an MP3 file; gr.Audio can play a
        # filepath directly, so return the path rather than raw bytes.
        deepgram.speak.v("1").save(filename, payload, options)
        return filename
    except Exception as e:
        print(f"Speech synthesis failed: {e}")
        return None

def process_audio(audio, chat_history: list[ChatMessage]):
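    """Button callback: transcribe the recording, generate a reply, voice it."""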
    # Nothing recorded yet: leave the chat history unchanged.
    if audio is None:
        return None, chat_history

    # Speech -> text -> LLM reply -> synthesized speech.
    transcript = get_transcript(audio)
    chat_history.append({"role": "user", "content": transcript})
    response = generate_response(chat_history)
    chat_history.append({"role": "assistant", "content": response})
    audio_path = speech_synthesis(response)
    return audio_path, chat_history

with gr.Blocks() as demo:
    gr.Markdown(
        "<h1 style='text-align: center;'>Welcome to the Audio Chatbot Demo</h1>"
    )
    with gr.Row():
        with gr.Column():
            input_audio = gr.Audio(
                label="Input Audio", sources=["microphone"], type="numpy"
            )
            output_audio = gr.Audio(label="Output Audio", interactive=False)
        with gr.Column():
            chatbot = gr.Chatbot(label="Chatbot", type="messages")
            process_button = gr.Button("Process Audio")

    process_button.click(
        fn=process_audio,
        inputs=[input_audio, chatbot],
        outputs=[output_audio, chatbot],
    )

if __name__ == "__main__":
    demo.launch()