import os

import gradio as gr
import torch
import whisper
from groq import Groq
from gtts import gTTS
from scipy.io import wavfile

# Load the Whisper model (use the GPU if one is available)
device = "cuda" if torch.cuda.is_available() else "cpu"
model = whisper.load_model("base", device=device)

# Set your Groq API key (replace the placeholder or export GROQ_API_KEY beforehand)
os.environ.setdefault("GROQ_API_KEY", "your_groq_api_key_here")
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))

# Function to transcribe audio using Whisper
def transcribe(audio_data):
    try:
        # Gradio's numpy audio component yields a (sample_rate, samples) tuple;
        # write it to a temporary WAV file that Whisper can read
        sample_rate, samples = audio_data
        audio_path = "temp_audio.wav"
        wavfile.write(audio_path, sample_rate, samples)

        # Transcribe the saved audio file
        result = model.transcribe(audio_path)
        os.remove(audio_path)  # Clean up the temporary file
        return result["text"]
    except Exception as e:
        return f"Error during transcription: {e}"

# Function to get a response from Groq's LLM
def get_llm_response(text):
    try:
        chat_completion = client.chat.completions.create(
            messages=[{"role": "user", "content": text}],
            model="llama-3.3-70b-versatile",
        )
        return chat_completion.choices[0].message.content
    except Exception as e:
        return f"Error during LLM response generation: {e}"

# Function to convert text to speech with gTTS
def text_to_speech(text):
    try:
        tts = gTTS(text, lang="en")
        audio_path = "response.mp3"
        tts.save(audio_path)
        return audio_path
    except Exception as e:
        return f"Error during text-to-speech conversion: {e}"

# Combined function: audio in -> transcription -> LLM reply -> spoken reply
def process_audio(audio_data):
    transcription = transcribe(audio_data)
    if "Error" in transcription:
        return transcription, None, None

    llm_response = get_llm_response(transcription)
    if "Error" in llm_response:
        return transcription, llm_response, None

    audio_response = text_to_speech(llm_response)
    if "Error" in audio_response:
        return transcription, llm_response, None

    return transcription, llm_response, audio_response

# Build the Gradio interface
with gr.Blocks() as app:
    gr.Markdown("## Real-Time Voice-to-Voice Chatbot")
    with gr.Row():
        with gr.Column():
            audio_input = gr.Audio(type="numpy", label="Speak", interactive=True)
        with gr.Column():
            transcription_output = gr.Textbox(label="Transcription (Text)", lines=2)
            response_output = gr.Textbox(label="Response (LLM Text)", lines=2)
            audio_output = gr.Audio(label="Response (Audio)")
    submit_button = gr.Button("Submit")

    # Connect the input and output components
    submit_button.click(
        process_audio,
        inputs=[audio_input],
        outputs=[transcription_output, response_output, audio_output],
    )

# Launch the app
app.launch()
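
# Optional: a minimal sketch of exercising the pipeline without the Gradio UI
# (an addition for illustration, not part of the original script). It assumes a
# local WAV file named "sample.wav" -- a hypothetical example path -- and that
# GROQ_API_KEY is set in the environment. Because app.launch() blocks, run this
# in place of the launch call or from a separate interactive session.
#
# from scipy.io import wavfile
#
# sample_rate, samples = wavfile.read("sample.wav")  # same (rate, samples) tuple Gradio passes
# transcription, llm_text, audio_path = process_audio((sample_rate, samples))
# print("Transcription:", transcription)
# print("LLM response:", llm_text)
# print("Spoken reply saved to:", audio_path)  # path to the gTTS MP3, or None on error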