import io
import os
import gradio as gr
from groq import Groq
import soundfile as sf
from dotenv import load_dotenv
from gradio import ChatMessage
from deepgram import DeepgramClient, SpeakOptions
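
# Configuration: Groq() picks up GROQ_API_KEY from the environment, and
# speech_synthesis() reads DEEPGRAM_API_KEY via os.getenv(); both are expected
# to come from a local .env file loaded by load_dotenv() in the __main__ block.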


def get_transcript(audio):
    # gr.Audio with type="numpy" delivers a (sample_rate, samples) tuple;
    # encode it as MP3 in memory before sending it to the API
    audio_buffer = io.BytesIO()
    sf.write(audio_buffer, audio[1], samplerate=audio[0], format="MP3")
    audio_buffer.seek(0)

    # Groq client (reads GROQ_API_KEY from the environment)
    client = Groq()
    transcription = client.audio.transcriptions.create(
        file=("audio.mp3", audio_buffer.read()),
        model="whisper-large-v3-turbo",
        response_format="json",
        temperature=0.0,
    )
    return transcription.text


def generate_response(chat_history: list[ChatMessage]):
    # Groq client
    client = Groq()
    messages = [
        {
            "role": "system",
            "content": "You are an assistant working in a helpline center. Answer queries in short and concise sentences. Keep in mind that the output will be converted to voice, so use appropriate vocabulary.",  # noqa
        }
    ]
    messages.extend(
        [
            {"role": message["role"], "content": message["content"]}
            for message in chat_history
        ]
    )
    response = client.chat.completions.create(
        model="llama3-8b-8192",
        messages=messages,
    )
    return response.choices[0].message.content


def speech_synthesis(text: str):
    DEEPGRAM_API_KEY = os.getenv("DEEPGRAM_API_KEY")
    TEXT = {"text": text}
    FILENAME = "audio.mp3"

    try:
        deepgram = DeepgramClient(DEEPGRAM_API_KEY)
        options = SpeakOptions(
            model="aura-luna-en",
        )
        # Synthesize the reply to an MP3 file, then return its raw bytes
        deepgram.speak.v("1").save(FILENAME, TEXT, options)
        with open(FILENAME, "rb") as audio_file:
            audio_data = audio_file.read()
        return audio_data
    except Exception as e:
        print(f"Exception: {e}")
        return None


def process_audio(audio, chat_history: list[ChatMessage]):
    # If no audio was recorded, return the chat history unchanged
    if audio is None:
        return None, chat_history

    # Speech-to-text, then LLM reply, then text-to-speech
    transcript = get_transcript(audio)
    chat_history.append({"role": "user", "content": transcript})
    response = generate_response(chat_history)
    chat_history.append({"role": "assistant", "content": response})
    audio_data = speech_synthesis(response)
    return audio_data, chat_history
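

# Gradio UI: microphone input and the synthesized reply on the left,
# the running chat transcript and a "Process Audio" button on the right.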
with gr.Blocks() as demo:
    gr.Markdown(
        "<h1 style='text-align: center;'> Welcome to the Audio Chatbot Demo</h1>"  # noqa
    )
    with gr.Row():
        with gr.Column():
            input_audio = gr.Audio(
                label="Input Audio", sources="microphone", type="numpy"
            )
            output_audio = gr.Audio(label="Output Audio", interactive=False)
        with gr.Column():
            chatbot = gr.Chatbot(label="Chatbot", type="messages")
            process_button = gr.Button("Process Audio")

    process_button.click(
        fn=process_audio,
        inputs=[input_audio, chatbot],
        outputs=[output_audio, chatbot],
    )


if __name__ == "__main__":
    load_dotenv()
    demo.launch()