import io
import os
import tempfile

import gradio as gr
import soundfile as sf
from deepgram import DeepgramClient, SpeakOptions
from dotenv import load_dotenv
from gradio import ChatMessage
from groq import Groq


def get_transcript(audio):
    """Transcribe a Gradio audio tuple via Groq's Whisper endpoint.

    Args:
        audio: ``(sample_rate, numpy_data)`` tuple as produced by
            ``gr.Audio(type="numpy")``.

    Returns:
        The transcribed text.
    """
    # Encode the raw samples as MP3 entirely in memory; the Groq endpoint
    # accepts a (filename, bytes) pair, so no temp file is needed here.
    audio_buffer = io.BytesIO()
    sf.write(audio_buffer, audio[1], samplerate=audio[0], format="MP3")
    audio_buffer.seek(0)

    # Groq() reads GROQ_API_KEY from the environment at request time,
    # i.e. after load_dotenv() has run in the __main__ guard.
    client = Groq()
    transcription = client.audio.transcriptions.create(
        file=("audio.mp3", audio_buffer.read()),
        model="whisper-large-v3-turbo",
        response_format="json",
        temperature=0.0,
    )
    return transcription.text


def generate_response(chat_history: list[ChatMessage]):
    """Generate an assistant reply for the accumulated chat history.

    Args:
        chat_history: Messages-format history (dicts with "role"/"content").

    Returns:
        The assistant's reply text.
    """
    client = Groq()
    messages = [
        {
            "role": "system",
            "content": "You are an assistant working in a helpline center. Answer queries in short and concise sentences. Keep in mind that the output will be converted to voice, so use appropriate vocabulary.",  # noqa
        }
    ]
    messages.extend(
        {"role": message["role"], "content": message["content"]}
        for message in chat_history
    )
    response = client.chat.completions.create(
        model="llama3-8b-8192",
        messages=messages,
    )
    return response.choices[0].message.content


def speech_synthesis(text: str):
    """Convert text to speech with Deepgram and return the MP3 bytes.

    Returns:
        The synthesized audio as bytes, or None on any failure
        (best-effort boundary: the caller treats None as "no audio").
    """
    try:
        deepgram = DeepgramClient(os.getenv("DEEPGRAM_API_KEY"))
        options = SpeakOptions(model="aura-luna-en")
        # BUGFIX: the original saved to a fixed "audio.mp3" in the CWD,
        # which races between concurrent requests and leaves the file
        # behind. A per-call temp directory avoids both problems.
        with tempfile.TemporaryDirectory() as tmp_dir:
            filename = os.path.join(tmp_dir, "audio.mp3")
            deepgram.speak.v("1").save(filename, {"text": text}, options)
            with open(filename, "rb") as audio_file:
                return audio_file.read()
    except Exception as e:
        # Deliberate broad catch: synthesis failure must not crash the
        # request; the chat text is still shown without audio.
        print(f"Exception: {e}")
        return None


def process_audio(audio, chat_history: list[ChatMessage]):
    """Full pipeline: transcribe -> chat completion -> speech synthesis.

    Args:
        audio: ``(sample_rate, numpy_data)`` tuple from the mic, or None.
        chat_history: Current messages-format history; appended in place.

    Returns:
        ``(audio_bytes_or_None, updated_chat_history)``.
    """
    # No recording yet: leave the history untouched.
    if audio is None:
        return None, chat_history
    transcript = get_transcript(audio)
    chat_history.append({"role": "user", "content": transcript})
    response = generate_response(chat_history)
    chat_history.append({"role": "assistant", "content": response})
    audio_data = speech_synthesis(response)
    return audio_data, chat_history


with gr.Blocks() as demo:
    # NOTE(review): the original heading markup was garbled in extraction;
    # the visible text is preserved here as a Markdown heading — confirm
    # against the upstream source if exact styling matters.
    gr.Markdown("# Welcome to the Audio Chatbot Demo")
    with gr.Row():
        with gr.Column():
            input_audio = gr.Audio(
                label="Input Audio", sources=["microphone"], type="numpy"
            )
            output_audio = gr.Audio(label="Output Audio", interactive=False)
        with gr.Column():
            chatbot = gr.Chatbot(label="Chatbot", type="messages")
    process_button = gr.Button("Process Audio")
    process_button.click(
        fn=process_audio,
        inputs=[input_audio, chatbot],
        outputs=[output_audio, chatbot],
    )

if __name__ == "__main__":
    # Load API keys (GROQ_API_KEY, DEEPGRAM_API_KEY) before serving.
    load_dotenv()
    demo.launch()