import os
import torch
import whisper
from gtts import gTTS
import gradio as gr
from groq import Groq
from scipy.io import wavfile

# Load the Whisper model
device = "cuda" if torch.cuda.is_available() else "cpu"
model = whisper.load_model("base", device=device)

GROQ_API_KEY ="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"

Client = Groq(api_key=GROQ_API_KEY)

# Set your Groq API key (replace with your actual key or set it in the environment)
os.environ["GROQ_API_KEY"] = "your_groq_api_key_here"
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))

# Function to transcribe audio using Whisper
def transcribe(audio_data):
    try:
        # Gradio's type="numpy" audio input arrives as a (sample_rate, data)
        # tuple; write it to a temporary WAV file for Whisper
        sample_rate, data = audio_data
        audio_path = "temp_audio.wav"
        wavfile.write(audio_path, sample_rate, data)

        # Transcribe the saved audio file
        result = model.transcribe(audio_path)
        os.remove(audio_path)  # Clean up the temporary file
        return result["text"]
    except Exception as e:
        return f"Error during transcription: {e}"

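# A minimal, hypothetical smoke test for transcribe(): one second of silence
# at 16 kHz, in the same (sample_rate, data) shape Gradio provides. Left
# commented out so running this file has no side effects:
#   import numpy as np
#   print(transcribe((16000, np.zeros(16000, dtype=np.int16))))
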
# Function to get response from Groq's LLM
def get_llm_response(text):
    try:
        chat_completion = client.chat.completions.create(
            messages=[{"role": "user", "content": text}],
            model="llama-3.3-70b-versatile",
        )
        return chat_completion.choices[0].message.content
    except Exception as e:
        return f"Error during LLM response generation: {e}"

# Function to convert text to speech
def text_to_speech(text):
    try:
        tts = gTTS(text, lang="en")
        audio_path = "response.mp3"
        tts.save(audio_path)
        return audio_path
    except Exception as e:
        return f"Error during text-to-speech conversion: {e}"

# Combined function for processing audio input and generating audio output
def process_audio(audio_data):
    transcription = transcribe(audio_data)
    if "Error" in transcription:
        return transcription, None, None
    
    llm_response = get_llm_response(transcription)
    if "Error" in llm_response:
        return transcription, llm_response, None
    
    audio_response = text_to_speech(llm_response)
    if "Error" in audio_response:
        return transcription, llm_response, audio_response

    return transcription, llm_response, audio_response

# Build the Gradio interface
with gr.Blocks() as app:
    gr.Markdown("## Real-Time Voice-to-Voice Chatbot")
    with gr.Row():
        with gr.Column():
            # Restrict input to the microphone so the chatbot is voice-driven
            audio_input = gr.Audio(sources=["microphone"], type="numpy", label="Speak", interactive=True)
        with gr.Column():
            transcription_output = gr.Textbox(label="Transcription (Text)", lines=2)
            response_output = gr.Textbox(label="Response (LLM Text)", lines=2)
            audio_output = gr.Audio(label="Response (Audio)")
    submit_button = gr.Button("Submit")

    # Connect the input and output components
    submit_button.click(
        process_audio,
        inputs=[audio_input],
        outputs=[transcription_output, response_output, audio_output],
    )

# Launch the app
if __name__ == "__main__":
    app.launch()
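
# To run this sketch locally (assumed pip package names):
#   pip install torch openai-whisper gTTS gradio groq scipy
#   export GROQ_API_KEY="your_groq_api_key"
#   python app.py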