import os
import torch
import whisper
from gtts import gTTS
import gradio as gr
from groq import Groq
import numpy as np
from scipy.io import wavfile
# Load the Whisper model (use GPU if available)
device = "cuda" if torch.cuda.is_available() else "cpu"
model = whisper.load_model("base", device=device)
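# Note: "base" is a speed/accuracy trade-off; openai-whisper also ships "tiny",
# "small", "medium", and "large" checkpoints that can be swapped in here.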
# Read the Groq API key from the environment (set GROQ_API_KEY before launching;
# never hard-code the key in source code)
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
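# Example (assuming a Unix-like shell):
#   export GROQ_API_KEY="gsk_..."
# On Hugging Face Spaces, the key can instead be stored as a repository secret,
# which the platform exposes to the app as an environment variable.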
# Function to transcribe audio using Whisper
def transcribe(audio_data):
    try:
        # Gradio's numpy audio component delivers a (sample_rate, samples) tuple
        sample_rate, samples = audio_data
        # Save it as a temporary WAV file so Whisper can load it
        audio_path = "temp_audio.wav"
        wavfile.write(audio_path, sample_rate, samples)
        # Transcribe the saved audio file
        result = model.transcribe(audio_path)
        os.remove(audio_path)  # Clean up the temporary file
        return result["text"]
    except Exception as e:
        return f"Error during transcription: {e}"
# Function to get a response from Groq's LLM
def get_llm_response(text):
    try:
        chat_completion = client.chat.completions.create(
            messages=[{"role": "user", "content": text}],
            model="llama-3.3-70b-versatile",
        )
        return chat_completion.choices[0].message.content
    except Exception as e:
        return f"Error during LLM response generation: {e}"
# Function to convert text to speech
def text_to_speech(text):
    try:
        tts = gTTS(text, lang="en")
        audio_path = "response.mp3"
        tts.save(audio_path)
        return audio_path
    except Exception as e:
        return f"Error during text-to-speech conversion: {e}"
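# Note: gTTS synthesizes speech via Google's online TTS endpoint, so the app
# needs outbound internet access; lang can be changed to any other
# gTTS-supported language code if the bot should answer in a different language.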
# Combined function for processing audio input and generating audio output
def process_audio(audio_data):
    if audio_data is None:
        return "No audio received. Please record something first.", None, None
    transcription = transcribe(audio_data)
    if "Error" in transcription:
        return transcription, None, None
    llm_response = get_llm_response(transcription)
    if "Error" in llm_response:
        return transcription, llm_response, None
    audio_response = text_to_speech(llm_response)
    if "Error" in audio_response:
        return transcription, llm_response, audio_response
    return transcription, llm_response, audio_response
# Build the Gradio interface
with gr.Blocks() as app:
    gr.Markdown("## Real-Time Voice-to-Voice Chatbot")
    with gr.Row():
        with gr.Column():
            audio_input = gr.Audio(type="numpy", label="Speak", interactive=True)
        with gr.Column():
            transcription_output = gr.Textbox(label="Transcription (Text)", lines=2)
            response_output = gr.Textbox(label="Response (LLM Text)", lines=2)
            audio_output = gr.Audio(label="Response (Audio)")
    submit_button = gr.Button("Submit")

    # Connect the input and output components
    submit_button.click(
        process_audio,
        inputs=[audio_input],
        outputs=[transcription_output, response_output, audio_output],
    )
# Launch the app
app.launch()
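# When running locally, app.launch(share=True) would also create a temporary
# public link; on Hugging Face Spaces the app is served automatically.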