import os
import torch
import whisper
from gtts import gTTS
import gradio as gr
from groq import Groq
from scipy.io import wavfile

# Load the Whisper model
device = "cuda" if torch.cuda.is_available() else "cpu"
model = whisper.load_model("base", device=device)

# Read the Groq API key from the environment (set GROQ_API_KEY as a secret; never hard-code it)
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))


# Function to transcribe audio using Whisper
def transcribe(audio_data):
    try:
        if audio_data is None:
            return "Error during transcription: no audio was recorded."
        # Gradio delivers numpy audio as a (sample_rate, samples) tuple;
        # write it to a temporary WAV file so Whisper can read it
        sample_rate, samples = audio_data
        audio_path = "temp_audio.wav"
        wavfile.write(audio_path, sample_rate, samples)
        # Transcribe the saved audio file
        result = model.transcribe(audio_path)
        os.remove(audio_path)  # Clean up the temporary file
        return result["text"]
    except Exception as e:
        return f"Error during transcription: {e}"


# Function to get a response from Groq's LLM
def get_llm_response(text):
    try:
        chat_completion = client.chat.completions.create(
            messages=[{"role": "user", "content": text}],
            model="llama-3.3-70b-versatile",
        )
        return chat_completion.choices[0].message.content
    except Exception as e:
        return f"Error during LLM response generation: {e}"


# Function to convert text to speech
def text_to_speech(text):
    try:
        tts = gTTS(text, lang="en")
        audio_path = "response.mp3"
        tts.save(audio_path)
        return audio_path
    except Exception as e:
        return f"Error during text-to-speech conversion: {e}"


# Combined function: audio input -> transcription -> LLM response -> audio output
def process_audio(audio_data):
    transcription = transcribe(audio_data)
    if "Error" in transcription:
        return transcription, None, None
    llm_response = get_llm_response(transcription)
    if "Error" in llm_response:
        return transcription, llm_response, None
    audio_response = text_to_speech(llm_response)
    if "Error" in audio_response:
        # Return no audio rather than passing an error string to the Audio component
        return transcription, llm_response, None
    return transcription, llm_response, audio_response


# Build the Gradio interface
with gr.Blocks() as app:
    gr.Markdown("## Real-Time Voice-to-Voice Chatbot")
    with gr.Row():
        with gr.Column():
            audio_input = gr.Audio(type="numpy", label="Speak", interactive=True)
        with gr.Column():
            transcription_output = gr.Textbox(label="Transcription (Text)", lines=2)
            response_output = gr.Textbox(label="Response (LLM Text)", lines=2)
            audio_output = gr.Audio(label="Response (Audio)")
    submit_button = gr.Button("Submit")

    # Connect the input and output components
    submit_button.click(
        process_audio,
        inputs=[audio_input],
        outputs=[transcription_output, response_output, audio_output],
    )

# Launch the app
app.launch()