import os
import torch
import whisper
from gtts import gTTS
import gradio as gr
from groq import Groq
from scipy.io import wavfile  # used to write Gradio's numpy audio to a WAV file
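# Assumed dependencies (not pinned in this file; install with pip):
#   pip install openai-whisper gTTS gradio groq scipy torch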
# Load the Whisper model
device = "cuda" if torch.cuda.is_available() else "cpu"
model = whisper.load_model("base", device=device)
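# Other Whisper sizes ("tiny", "small", "medium", "large") trade speed for accuracy.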
# Read the Groq API key from the environment (e.g. `export GROQ_API_KEY=...`).
# Never hard-code API keys in source files.
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
# Function to transcribe audio using Whisper
def transcribe(audio_data):
    try:
        # Gradio's numpy audio arrives as a (sample_rate, samples) tuple,
        # so write it out as a temporary WAV file that Whisper can read
        sample_rate, samples = audio_data
        audio_path = "temp_audio.wav"
        wavfile.write(audio_path, sample_rate, samples)
        # Transcribe the saved audio file
        result = model.transcribe(audio_path)
        os.remove(audio_path)  # Clean up the temporary file
        return result["text"]
    except Exception as e:
        return f"Error during transcription: {e}"
# Function to get response from Groq's LLM
def get_llm_response(text):
    try:
        chat_completion = client.chat.completions.create(
            messages=[{"role": "user", "content": text}],
            model="llama-3.3-70b-versatile",
        )
        return chat_completion.choices[0].message.content
    except Exception as e:
        return f"Error during LLM response generation: {e}"
# Function to convert text to speech
def text_to_speech(text):
    try:
        tts = gTTS(text, lang="en")
        audio_path = "response.mp3"
        tts.save(audio_path)
        return audio_path
    except Exception as e:
        return f"Error during text-to-speech conversion: {e}"
# Combined function for processing audio input and generating audio output
def process_audio(audio_data):
    transcription = transcribe(audio_data)
    if "Error" in transcription:
        return transcription, None, None
    llm_response = get_llm_response(transcription)
    if "Error" in llm_response:
        return transcription, llm_response, None
    audio_response = text_to_speech(llm_response)
    if "Error" in audio_response:
        # Don't pass an error string to the audio player; it is not a file path
        return transcription, llm_response, None
    return transcription, llm_response, audio_response
# Build the Gradio interface
with gr.Blocks() as app:
    gr.Markdown("## Real-Time Voice-to-Voice Chatbot")
    with gr.Row():
        with gr.Column():
            audio_input = gr.Audio(type="numpy", label="Speak", interactive=True)
        with gr.Column():
            transcription_output = gr.Textbox(label="Transcription (Text)", lines=2)
            response_output = gr.Textbox(label="Response (LLM Text)", lines=2)
            audio_output = gr.Audio(label="Response (Audio)")
    submit_button = gr.Button("Submit")

    # Connect the input and output components
    submit_button.click(
        process_audio,
        inputs=[audio_input],
        outputs=[transcription_output, response_output, audio_output],
    )
# Launch the app
app.launch()
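# Usage sketch (assumptions, not part of the original file): run
# `GROQ_API_KEY=... python app.py`, open the printed local URL, record a
# question, and press Submit. Pass `app.launch(share=True)` to get a
# temporary public link.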