import io
import os

import gradio as gr
import soundfile as sf
from deepgram import DeepgramClient, SpeakOptions
from dotenv import load_dotenv
from gradio import ChatMessage
from groq import Groq
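
# Assumed setup (not shown in the original listing): both clients read their
# credentials from the environment, so GROQ_API_KEY and DEEPGRAM_API_KEY must
# be set, e.g. in a .env file picked up by load_dotenv() at startup.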


def get_transcript(audio):
    """Transcribe a Gradio microphone recording with Groq's Whisper endpoint."""
    # Gradio's numpy audio format is a (sample_rate, data) tuple; encode it
    # as MP3 in memory so it can be uploaded without touching disk.
    audio_buffer = io.BytesIO()
    sf.write(audio_buffer, audio[1], samplerate=audio[0], format="MP3")
    audio_buffer.seek(0)

    # The Groq client reads GROQ_API_KEY from the environment.
    client = Groq()
    transcription = client.audio.transcriptions.create(
        file=("audio.mp3", audio_buffer.read()),
        model="whisper-large-v3-turbo",
        response_format="json",
        temperature=0.0,
    )
    return transcription.text
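
# Hypothetical usage sketch: with type="numpy", the microphone component hands
# get_transcript a (sample_rate, samples) tuple, so a call looks like:
#
#   import numpy as np
#   sr = 16000
#   samples = np.zeros(sr, dtype=np.float32)  # stand-in for real speech
#   print(get_transcript((sr, samples)))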


def generate_response(chat_history: list[ChatMessage]):
    """Generate the assistant's next reply from the running chat history."""
    client = Groq()
    messages = [
        {
            "role": "system",
            "content": (
                "You are an assistant working in a helpline center. "
                "Answer queries in short and concise sentences. Keep in "
                "mind that the output will be converted to voice, so use "
                "appropriate vocabulary."
            ),
        }
    ]
    messages.extend(
        [
            {"role": message["role"], "content": message["content"]}
            for message in chat_history
        ]
    )
    response = client.chat.completions.create(
        model="llama3-8b-8192",
        messages=messages,
    )
    return response.choices[0].message.content
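
# Hypothetical usage sketch, using the same messages-format dicts the
# gr.Chatbot component stores:
#
#   history = [{"role": "user", "content": "What are your opening hours?"}]
#   print(generate_response(history))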


def speech_synthesis(text: str):
    """Synthesize the reply with Deepgram's TTS and return the MP3 bytes."""
    deepgram_api_key = os.getenv("DEEPGRAM_API_KEY")
    filename = "audio.mp3"
    try:
        deepgram = DeepgramClient(deepgram_api_key)
        options = SpeakOptions(
            model="aura-luna-en",
        )
        # The SDK writes the synthesized speech to disk; read it back so the
        # caller gets raw MP3 bytes.
        deepgram.speak.v("1").save(filename, {"text": text}, options)
        with open(filename, "rb") as audio_file:
            return audio_file.read()
    except Exception as e:
        print(f"Exception: {e}")
        return None
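
# Hypothetical usage sketch (requires DEEPGRAM_API_KEY to be set):
#
#   mp3_bytes = speech_synthesis("Thanks for calling. How can I help?")
#   if mp3_bytes is not None:
#       print(f"Synthesized {len(mp3_bytes)} bytes of MP3 audio")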


def process_audio(audio, chat_history: list[ChatMessage]):
    """Run the full pipeline: speech in -> transcript -> reply -> speech out."""
    # Nothing was recorded, so leave the chat untouched.
    if audio is None:
        return None, chat_history
    transcript = get_transcript(audio)
    chat_history.append({"role": "user", "content": transcript})
    response = generate_response(chat_history)
    chat_history.append({"role": "assistant", "content": response})
    audio_data = speech_synthesis(response)
    return audio_data, chat_history
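
# The two return values line up with the click handler's outputs below: the
# synthesized audio feeds the output gr.Audio and the updated history
# re-renders the gr.Chatbot.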


with gr.Blocks() as demo:
    gr.Markdown(
        "<h1 style='text-align: center;'>Welcome to the Audio Chatbot Demo</h1>"
    )
    with gr.Row():
        with gr.Column():
            input_audio = gr.Audio(
                label="Input Audio", sources=["microphone"], type="numpy"
            )
            output_audio = gr.Audio(label="Output Audio", interactive=False)
        with gr.Column():
            chatbot = gr.Chatbot(label="Chatbot", type="messages")
            process_button = gr.Button("Process Audio")
    process_button.click(
        fn=process_audio,
        inputs=[input_audio, chatbot],
        outputs=[output_audio, chatbot],
    )

if __name__ == "__main__":
    # Load GROQ_API_KEY / DEEPGRAM_API_KEY from .env before serving requests.
    load_dotenv()
    demo.launch()