# voice-to-voice / app.py
import gradio as gr
import whisper
from gtts import gTTS
from groq import Groq
import os
import numpy as np
import soundfile as sf
import logging
# Configure logging
logging.basicConfig(level=logging.DEBUG)
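# NOTE: DEBUG logging is verbose; logging.INFO would be a quieter choice for deployment.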
# Initialize the Groq API key from environment variables
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
if not GROQ_API_KEY:
    raise RuntimeError("GROQ_API_KEY environment variable not set.")
# Initialize Whisper model (No API key required)
try:
    whisper_model = whisper.load_model("base")
    logging.info("Whisper model loaded successfully.")
except Exception as e:
    raise RuntimeError(f"Error loading Whisper model: {e}")
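# NOTE: "base" trades accuracy for speed; the other standard Whisper sizes
# ("tiny", "small", "medium", "large") could be substituted here if the host
# has the memory and compute for them.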
# Initialize Groq client (API key required for Groq API)
try:
    client = Groq(
        api_key=GROQ_API_KEY  # Use the API key from the environment variable
    )
    logging.info("Groq client initialized successfully.")
except Exception as e:
    raise RuntimeError(f"Error initializing Groq client: {e}")
# Function to transcribe audio using Whisper
def transcribe_audio(audio):
    try:
        # Load the audio file with soundfile
        logging.debug(f"Loading audio file: {audio}")
        audio_data, sample_rate = sf.read(audio, dtype='float32')  # Whisper expects float32
        logging.debug(f"Audio loaded with sample rate: {sample_rate}, data shape: {audio_data.shape}")
        # Downmix stereo recordings to mono; Whisper expects a 1-D array
        if audio_data.ndim > 1:
            audio_data = audio_data.mean(axis=1)
        # Whisper expects 16 kHz audio
        if sample_rate != 16000:
            logging.debug(f"Resampling audio from {sample_rate} to 16000 Hz")
            # Linearly interpolate the samples onto a 16 kHz grid
            num_samples = int(len(audio_data) * (16000 / sample_rate))
            audio_data_resampled = np.interp(
                np.linspace(0, len(audio_data) - 1, num_samples),
                np.arange(len(audio_data)),
                audio_data,
            )
            audio_data = audio_data_resampled.astype(np.float32)  # Keep dtype float32
            sample_rate = 16000
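        # NOTE: np.interp is a crude linear resampler; a dedicated resampler
        # (e.g. scipy.signal.resample_poly -- an alternative, not used here)
        # would give cleaner audio at the cost of an extra dependency.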
        # Perform the transcription (fp16=False avoids a half-precision warning on CPU-only hosts)
        result = whisper_model.transcribe(audio_data, fp16=False)
        logging.debug(f"Transcription result: {result['text']}")
        return result['text']
    except Exception as e:
        logging.error(f"Error during transcription: {e}")
        return f"Error during transcription: {e}"
# Function to get response from LLaMA model using Groq API
def get_response(text):
    try:
        logging.debug(f"Sending request to Groq API with text: {text}")
        chat_completion = client.chat.completions.create(
            messages=[
                {
                    "role": "user",
                    "content": text,  # Use the transcribed text as the prompt
                }
            ],
            model="llama3-8b-8192",  # LLaMA 3 8B served by Groq
        )
        # Extract and return the model's response
        response_text = chat_completion.choices[0].message.content
        logging.debug(f"Received response from Groq API: {response_text}")
        return response_text
    except Exception as e:
        logging.error(f"Error during model response generation: {e}")
        return f"Error during model response generation: {e}"
# Function to convert text to speech using gTTS
def text_to_speech(text):
    try:
        tts = gTTS(text)
        tts.save("response.mp3")
        logging.debug("Text-to-speech conversion completed successfully.")
        return "response.mp3"
    except Exception as e:
        logging.error(f"Error during text-to-speech conversion: {e}")
        return f"Error during text-to-speech conversion: {e}"
# Combined function for Gradio
def chatbot(audio):
    try:
        # Step 1: Transcribe the audio input using Whisper
        user_input = transcribe_audio(audio)
        # Check whether transcription returned an error message
        # (matching the exact sentinel prefix avoids misfiring on transcripts
        # that merely contain the word "Error")
        if user_input.startswith("Error during"):
            return user_input, None
        logging.debug(f"Transcribed text: {user_input}")
        # Step 2: Get a response from the LLaMA model via the Groq API
        response_text = get_response(user_input)
        # Check whether response generation returned an error message
        if response_text.startswith("Error during"):
            return response_text, None
        logging.debug(f"Response text: {response_text}")
        # Step 3: Convert the response text to speech with gTTS
        response_audio = text_to_speech(response_text)
        # Check whether text-to-speech conversion returned an error message
        if response_audio.startswith("Error during"):
            return response_audio, None
        # Step 4: Return the response text and the audio file path
        return response_text, response_audio
    except Exception as e:
        logging.error(f"Unexpected error occurred: {e}")
        return f"Unexpected error occurred: {e}", None
# Gradio Interface
iface = gr.Interface(
    fn=chatbot,
    inputs=gr.Audio(type="filepath"),
    outputs=[gr.Textbox(label="Response Text"), gr.Audio(label="Response Audio")],
    live=True,
    title="Voice-to-Voice Chatbot",
    description="Speak to the bot, and it will respond with voice.",
)
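# NOTE: live=True makes Gradio run the pipeline as soon as the input changes,
# with no Submit button; dropping it would let users review a recording before
# sending it.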
try:
    iface.launch()
except Exception as e:
    logging.error(f"Error launching Gradio interface: {e}")
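# NOTE: iface.launch(share=True) is a standard Gradio option that would also
# expose a temporary public URL (not used in the original app).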