File size: 5,132 Bytes
8afdb95
0bb7775
8afdb95
9e5a4f6
 
 
 
 
0bb7775
9e5a4f6
 
8afdb95
9e5a4f6
 
8afdb95
9e5a4f6
 
586d983
9e5a4f6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
586d983
9e5a4f6
 
 
 
 
 
 
 
 
 
586d983
9e5a4f6
 
 
 
586d983
9e5a4f6
 
02d76aa
9e5a4f6
 
 
 
 
 
 
 
 
 
 
 
02d76aa
 
9e5a4f6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8afdb95
9e5a4f6
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
import logging
import os
import tempfile

import gradio as gr
import numpy as np
import soundfile as sf
import whisper
from groq import Groq
from gtts import gTTS

# Configure logging.
# DEBUG level so every pipeline stage below (transcription, Groq call, TTS)
# emits its intermediate values.
logging.basicConfig(level=logging.DEBUG)

# Initialize the Groq API key from environment variables.
# Read once at import time; the whole app is useless without it, so we
# fail fast rather than erroring on the first request.
GROQ_API_KEY = os.getenv("GROQ_API_KEY")

if not GROQ_API_KEY:
    raise RuntimeError("GROQ_API_KEY environment variable not set.")

# Initialize Whisper model (no API key required).
# "base" is the small multilingual checkpoint; loading happens once at
# startup so each request only pays inference cost.
try:
    whisper_model = whisper.load_model("base")
    logging.info("Whisper model loaded successfully.")
except Exception as e:
    # Re-raise as RuntimeError so a missing/corrupt model aborts startup
    # with a clear message instead of failing mid-request.
    raise RuntimeError(f"Error loading Whisper model: {e}")

# Initialize Groq client (API key required for Groq API).
try:
    client = Groq(
        api_key=GROQ_API_KEY  # Use the API key from the environment variable
    )
    logging.info("Groq client initialized successfully.")
except Exception as e:
    raise RuntimeError(f"Error initializing Groq client: {e}")

# Function to transcribe audio using Whisper
def transcribe_audio(audio):
    """Transcribe an audio file to text with the local Whisper model.

    Args:
        audio: Filesystem path to the recording (as supplied by
            ``gr.Audio(type="filepath")``).

    Returns:
        The transcribed text on success, or a string starting with
        "Error" on failure (callers test for that prefix instead of
        catching exceptions).
    """
    try:
        logging.debug(f"Loading audio file: {audio}")
        # Whisper expects mono float32 PCM sampled at 16 kHz.
        audio_data, sample_rate = sf.read(audio, dtype='float32')
        logging.debug(f"Audio loaded with sample rate: {sample_rate}, data shape: {audio_data.shape}")

        # Down-mix multi-channel audio to mono. sf.read returns a 2-D
        # (frames, channels) array for stereo input, which would break
        # both np.interp below and Whisper itself.
        if audio_data.ndim > 1:
            audio_data = audio_data.mean(axis=1).astype(np.float32)

        # Whisper expects a specific sample rate.
        if sample_rate != 16000:
            logging.debug(f"Resampling audio from {sample_rate} to 16000 Hz")
            # Linear-interpolation resample to 16000 Hz. Sample positions
            # span [0, len-1] so the final point maps onto the last real
            # input sample (using len() would extrapolate past the end
            # of the signal).
            num_samples = int(len(audio_data) * (16000 / sample_rate))
            audio_data = np.interp(
                np.linspace(0, len(audio_data) - 1, num_samples),
                np.arange(len(audio_data)),
                audio_data,
            ).astype(np.float32)  # Ensure dtype stays float32 for Whisper
            sample_rate = 16000

        # Perform the transcription.
        result = whisper_model.transcribe(audio_data)
        logging.debug(f"Transcription result: {result['text']}")
        return result['text']
    except Exception as e:
        logging.error(f"Error during transcription: {e}")
        return f"Error during transcription: {e}"

# Function to get response from LLaMA model using Groq API
def get_response(text):
    """Ask the LLaMA model (via the Groq API) for a reply to *text*.

    Returns the model's reply, or a string starting with "Error" when
    the API call fails.
    """
    try:
        logging.debug(f"Sending request to Groq API with text: {text}")
        # Single-turn chat: the transcribed text is the only user message.
        completion = client.chat.completions.create(
            messages=[{"role": "user", "content": text}],
            model="llama3-8b-8192",  # Ensure the correct model is used
        )

        # The first (and only) choice carries the model's reply.
        reply = completion.choices[0].message.content
        logging.debug(f"Received response from Groq API: {reply}")
        return reply
    except Exception as e:
        logging.error(f"Error during model response generation: {e}")
        return f"Error during model response generation: {e}"

# Function to convert text to speech using gTTS
def text_to_speech(text):
    """Convert *text* to speech with gTTS and return the MP3 file path.

    Writes to a unique temporary file rather than a fixed "response.mp3"
    in the working directory, so concurrent Gradio requests cannot
    overwrite each other's audio.

    Returns the path to the generated MP3, or a string starting with
    "Error" on failure (callers test for that prefix).
    """
    try:
        tts = gTTS(text)
        # mkstemp guarantees a unique path; close the OS-level descriptor
        # immediately because gTTS writes by filename, not file handle.
        fd, mp3_path = tempfile.mkstemp(suffix=".mp3")
        os.close(fd)
        tts.save(mp3_path)
        logging.debug("Text-to-speech conversion completed successfully.")
        return mp3_path
    except Exception as e:
        logging.error(f"Error during text-to-speech conversion: {e}")
        return f"Error during text-to-speech conversion: {e}"

# Combined function for Gradio
def chatbot(audio):
    """Run the full voice-to-voice pipeline for one recording.

    Pipeline: transcribe the audio, query the LLaMA model, synthesize
    the reply as speech. Each helper signals failure by returning a
    string containing "Error", which short-circuits the pipeline.

    Returns:
        (response_text, response_audio_path); on failure the first
        element is the error message and the second is None.
    """
    try:
        # Step 1: speech -> text via Whisper.
        transcript = transcribe_audio(audio)
        if "Error" in transcript:
            return transcript, None
        logging.debug(f"Transcribed text: {transcript}")

        # Step 2: text -> model reply via the Groq API.
        reply = get_response(transcript)
        if "Error" in reply:
            return reply, None
        logging.debug(f"Response text: {reply}")

        # Step 3: reply text -> speech via gTTS.
        speech_path = text_to_speech(reply)
        if "Error" in speech_path:
            return speech_path, None

        # Step 4: hand both the text and the audio file back to Gradio.
        return reply, speech_path

    except Exception as e:
        logging.error(f"Unexpected error occurred: {e}")
        return f"Unexpected error occurred: {e}", None

# Gradio Interface
# Wires the chatbot pipeline to a simple web UI: one audio input
# (recorded or uploaded, delivered as a file path) and two outputs
# matching chatbot()'s (text, audio-path) return tuple.
iface = gr.Interface(
    fn=chatbot,
    inputs=gr.Audio(type="filepath"),
    outputs=[gr.Textbox(label="Response Text"), gr.Audio(label="Response Audio")],
    live=True,  # re-run automatically when the input changes
    title="Voice-to-Voice Chatbot",
    description="Speak to the bot, and it will respond with voice.",
)

# Launch the app; log (rather than crash on) launch failures such as an
# unavailable port.
try:
    iface.launch()
except Exception as e:
    logging.error(f"Error launching Gradio interface: {e}")