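"""Audio chatbot demo.

Pipeline: record microphone audio in Gradio, transcribe it with Groq's
Whisper endpoint, generate a reply with a Groq-hosted LLM, and speak the
reply back through Deepgram text-to-speech.

Expects GROQ_API_KEY and DEEPGRAM_API_KEY in the environment; a local
.env file is loaded at startup.
"""
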
import io
import os

import gradio as gr
from groq import Groq
import soundfile as sf
from dotenv import load_dotenv
from gradio import ChatMessage
from deepgram import DeepgramClient, SpeakOptions


def get_transcript(audio):
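    """Transcribe Gradio's (sample_rate, numpy array) audio tuple via the
    Groq Whisper endpoint."""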

    # Encode the raw samples as MP3 in memory (MP3 writing via soundfile
    # relies on libsndfile >= 1.1.0).
    audio_buffer = io.BytesIO()
    sf.write(audio_buffer, audio[1], samplerate=audio[0], format="MP3")
    audio_buffer.seek(0)

    # Groq client; reads GROQ_API_KEY from the environment.
    client = Groq()

    transcription = client.audio.transcriptions.create(
        file=("audio.mp3", audio_buffer.read()),
        model="whisper-large-v3-turbo",
        response_format="json",
        temperature=0.0,
    )

    return transcription.text


def generate_response(chat_history: list[ChatMessage]):
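    """Ask a Groq-hosted LLM for the next assistant reply.

    Gradio's "messages"-type chat history already uses the
    {"role": ..., "content": ...} shape the Groq chat API expects.
    """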

    # Groq client
    client = Groq()

    messages = [
        {
            "role": "system",
            "content": "You are an assistant working in a helpline center. Answer queries in short and concise sentences. Keep in mind that the output will be converted to voice, so use appropriate vocabulary.",  # noqa
        }
    ]

    messages.extend(
        [
            {"role": message["role"], "content": message["content"]}
            for message in chat_history
        ]
    )

    response = client.chat.completions.create(
        model="llama3-8b-8192",
        messages=messages,
    )

    return response.choices[0].message.content


def speech_synthesis(text: str):
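    """Synthesize `text` with Deepgram TTS; return the saved MP3 path,
    or None if synthesis fails."""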

    api_key = os.getenv("DEEPGRAM_API_KEY")
    payload = {"text": text}
    filename = "audio.mp3"

    try:
        deepgram = DeepgramClient(api_key)

        options = SpeakOptions(
            model="aura-luna-en",
        )

        # Save the synthesized speech to disk and return the file path;
        # gr.Audio plays a filepath directly (raw bytes are reserved for
        # streaming outputs).
        deepgram.speak.v("1").save(filename, payload, options)
        return filename

    except Exception as e:
        print(f"Exception: {e}")
        return None


def process_audio(audio, chat_history: list[ChatMessage]):
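    """Run one conversational turn: transcribe, reply, then synthesize."""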
    # Nothing recorded: leave the output empty and the history unchanged.
    if audio is None:
        return None, chat_history

    transcript = get_transcript(audio)
    chat_history.append({"role": "user", "content": transcript})

    response = generate_response(chat_history)
    chat_history.append({"role": "assistant", "content": response})

    # speech_synthesis returns the saved MP3's path (or None on failure),
    # which gr.Audio can play directly.
    audio_path = speech_synthesis(response)

    return audio_path, chat_history


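# UI: microphone input and the synthesized reply on the left, the running
# chat transcript on the right.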
with gr.Blocks() as demo:
    gr.Markdown(
        "<h1 style='text-align: center;'> Welcome to the Audio Chatbot Demo</h1>"  # noqa
    )
    with gr.Row():
        with gr.Column():
            input_audio = gr.Audio(
                label="Input Audio", sources=["microphone"], type="numpy"
            )
            output_audio = gr.Audio(label="Output Audio", interactive=False)
        with gr.Column():
            chatbot = gr.Chatbot(label="Chatbot", type="messages")

    process_button = gr.Button("Process Audio")
    process_button.click(
        fn=process_audio,
        inputs=[input_audio, chatbot],
        outputs=[output_audio, chatbot],
    )

if __name__ == "__main__":
    load_dotenv()
    demo.launch()