import io
import os
import tempfile

import gradio as gr
import soundfile as sf
from deepgram import DeepgramClient, SpeakOptions
from dotenv import load_dotenv
from gradio import ChatMessage
from groq import Groq


def get_transcript(audio):
    """Transcribe a Gradio audio tuple via Groq's Whisper endpoint.

    Args:
        audio: ``(sample_rate, numpy_data)`` tuple as produced by
            ``gr.Audio(type="numpy")``.

    Returns:
        The transcribed text.
    """
    # Encode the raw samples as MP3 entirely in memory; the Groq endpoint
    # accepts a (filename, bytes) pair, so no temp file is needed here.
    audio_buffer = io.BytesIO()
    sf.write(audio_buffer, audio[1], samplerate=audio[0], format="MP3")
    audio_buffer.seek(0)

    # Groq() reads GROQ_API_KEY from the environment at request time,
    # i.e. after load_dotenv() has run in the __main__ guard.
    client = Groq()
    transcription = client.audio.transcriptions.create(
        file=("audio.mp3", audio_buffer.read()),
        model="whisper-large-v3-turbo",
        response_format="json",
        temperature=0.0,
    )
    return transcription.text


def generate_response(chat_history: list[ChatMessage]):
    """Generate an assistant reply for the accumulated chat history.

    Args:
        chat_history: Messages-format history (dicts with "role"/"content").

    Returns:
        The assistant's reply text.
    """
    client = Groq()
    messages = [
        {
            "role": "system",
            "content": "You are an assistant working in a helpline center. Answer queries in short and concise sentences. Keep in mind that the output will be converted to voice, so use appropriate vocabulary.",  # noqa
        }
    ]
    messages.extend(
        {"role": message["role"], "content": message["content"]}
        for message in chat_history
    )
    response = client.chat.completions.create(
        model="llama3-8b-8192",
        messages=messages,
    )
    return response.choices[0].message.content


def speech_synthesis(text: str):
    """Convert text to speech with Deepgram and return the MP3 bytes.

    Returns:
        The synthesized audio as bytes, or None on any failure
        (best-effort boundary: the caller treats None as "no audio").
    """
    try:
        deepgram = DeepgramClient(os.getenv("DEEPGRAM_API_KEY"))
        options = SpeakOptions(model="aura-luna-en")
        # BUGFIX: the original saved to a fixed "audio.mp3" in the CWD,
        # which races between concurrent requests and leaves the file
        # behind. A per-call temp directory avoids both problems.
        with tempfile.TemporaryDirectory() as tmp_dir:
            filename = os.path.join(tmp_dir, "audio.mp3")
            deepgram.speak.v("1").save(filename, {"text": text}, options)
            with open(filename, "rb") as audio_file:
                return audio_file.read()
    except Exception as e:
        # Deliberate broad catch: synthesis failure must not crash the
        # request; the chat text is still shown without audio.
        print(f"Exception: {e}")
        return None


def process_audio(audio, chat_history: list[ChatMessage]):
    """Full pipeline: transcribe -> chat completion -> speech synthesis.

    Args:
        audio: ``(sample_rate, numpy_data)`` tuple from the mic, or None.
        chat_history: Current messages-format history; appended in place.

    Returns:
        ``(audio_bytes_or_None, updated_chat_history)``.
    """
    # No recording yet: leave the history untouched.
    if audio is None:
        return None, chat_history
    transcript = get_transcript(audio)
    chat_history.append({"role": "user", "content": transcript})
    response = generate_response(chat_history)
    chat_history.append({"role": "assistant", "content": response})
    audio_data = speech_synthesis(response)
    return audio_data, chat_history


with gr.Blocks() as demo:
    # NOTE(review): the original heading markup was garbled in extraction;
    # the visible text is preserved here as a Markdown heading — confirm
    # against the upstream source if exact styling matters.
    gr.Markdown("# Welcome to the Audio Chatbot Demo")
    with gr.Row():
        with gr.Column():
            input_audio = gr.Audio(
                label="Input Audio", sources=["microphone"], type="numpy"
            )
            output_audio = gr.Audio(label="Output Audio", interactive=False)
        with gr.Column():
            chatbot = gr.Chatbot(label="Chatbot", type="messages")
    process_button = gr.Button("Process Audio")
    process_button.click(
        fn=process_audio,
        inputs=[input_audio, chatbot],
        outputs=[output_audio, chatbot],
    )

if __name__ == "__main__":
    # Load API keys (GROQ_API_KEY, DEEPGRAM_API_KEY) before serving.
    load_dotenv()
    demo.launch()