# FYP-VocalAI / app.py
# Source: Hugging Face Space by adeel707 (commit 72cfb7f, "Update app.py")
import streamlit as st
import tempfile
import os
import torch
import whisper
from TTS.api import TTS
from groq import Groq
from pydub import AudioSegment
from streamlit_option_menu import option_menu
import numba
# Groq API key, read from the environment.
# SECURITY FIX: the key used to be hardcoded here. A secret committed to
# source control must be treated as compromised — rotate it, and supply the
# replacement via the GROQ_API_KEY environment variable (e.g. a Space secret).
GROQ_API_KEY = os.getenv("GROQ_API_KEY", "")
# Load the Coqui XTTS-v2 multilingual voice-cloning model once at startup.
# gpu=False forces CPU inference; loading is slow, so it is done at module
# import time rather than per request.
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=False)
def get_llm_response(user_input):
    """Send *user_input* to the Groq chat API and return the model's reply text.

    A fresh client is created per call using the module-level GROQ_API_KEY.
    """
    client = Groq(api_key=GROQ_API_KEY)
    # System prompt pins the assistant to very short (<= 25 word) answers.
    system_prompt = ("IMPORTANT: You are an AI assistant that MUST provide responses in 25 words or less. NO EXCEPTIONS...")
    completion = client.chat.completions.create(
        model="llama3-8b-8192",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_input},
        ],
        temperature=0.5,
        top_p=1,
        stop=None,
        stream=False,
    )
    # Non-streaming call: the full reply is in the first (only) choice.
    return completion.choices[0].message.content
# Cache of loaded Whisper models, keyed by model size.
# FIX: the original reloaded the model from disk on every transcription call,
# which is very slow; load each size at most once per process.
_whisper_models = {}

def transcribe_audio(audio_path, model_size="base"):
    """Transcribe the audio file at *audio_path* with OpenAI Whisper.

    Args:
        audio_path: Path to an audio file readable by Whisper/ffmpeg.
        model_size: Whisper checkpoint name (e.g. "base", "small").

    Returns:
        The transcribed text as a string.
    """
    model = _whisper_models.get(model_size)
    if model is None:
        model = whisper.load_model(model_size)
        _whisper_models[model_size] = model
    result = model.transcribe(audio_path)
    return result["text"]
def generate_speech(text, output_file, speaker_wav, language="hi"):
    """Synthesize *text* in the voice of *speaker_wav* and write it to *output_file*.

    Args:
        text: Text to synthesize.
        output_file: Destination WAV path.
        speaker_wav: Path to the reference speaker audio used for cloning.
        language: XTTS language code (default "hi"; callers in this app pass "en").
    """
    tts.tts_to_file(
        text=text,
        file_path=output_file,
        speaker_wav=speaker_wav,
        language=language,
    )
# --- UI: page title and horizontal navigation -------------------------------
st.title("πŸ—£οΈ VocalAI - AI-Powered Voice Cloning & Chatbot")

# The two app modes; each option label is matched literally by the page
# dispatch below, so these strings must not change.
nav_options = ["Text-to-Speech", "Voice-Cloned Chatbot"]
nav_icons = ["mic", "chat-dots"]

selected_page = option_menu(
    menu_title=None,
    options=nav_options,
    icons=nav_icons,
    menu_icon="cast",
    default_index=0,
    orientation="horizontal",
)
# --- Sidebar: reference speaker audio used for voice cloning ----------------
accepted_audio_types = ["wav", "ogg", "mp3"]
st.sidebar.header("Upload Reference Audio")
ref_audio = st.sidebar.file_uploader("Upload a speaker audio file (WAV format)", type=accepted_audio_types)
if selected_page == "Text-to-Speech":
    # --- Page 1: plain text -> cloned-voice speech --------------------------
    st.header("πŸ”Š Text-to-Speech (TTS)")
    text = st.text_area("Enter text to synthesize:", "Hello, this is a cloned voice test.")
    if st.button("Generate Voice"):
        if ref_audio is None:
            st.warning("⚠️ Please upload a reference speaker audio file first!")
        else:
            # Persist the uploaded reference audio to a real file path, since
            # the TTS API expects a filename rather than a file-like object.
            with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_speaker:
                temp_speaker.write(ref_audio.read())
                speaker_wav_path = temp_speaker.name
            try:
                # NOTE(review): writing to a fixed name in the CWD means
                # concurrent sessions overwrite each other's output — consider
                # a per-session temp file.
                output_path = "cloned_output.wav"
                generate_speech(text, output_path, speaker_wav_path, language="en")
                st.audio(output_path, format="audio/wav")
                with open(output_path, "rb") as f:
                    st.download_button("Download Cloned Voice", f, file_name="cloned_voice.wav", mime="audio/wav")
            finally:
                # FIX: always delete the temp speaker file, even if synthesis
                # raises; previously an exception here leaked the file.
                os.unlink(speaker_wav_path)
elif selected_page == "Voice-Cloned Chatbot":
st.header("πŸ’¬ AI Chatbot with Voice Cloning")
user_query = st.text_area("Enter your query:", "Hello, explain AI briefly.")
uploaded_voice = st.file_uploader("Or upload an audio query (WAV format)", type=["wav", "ogg", "mp3"])
if uploaded_voice is not None:
with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio:
temp_audio.write(uploaded_voice.read())
audio_path = temp_audio.name
user_query = transcribe_audio(audio_path)
os.unlink(audio_path)
st.write("**Transcribed Query:**", user_query)
if st.button("Generate Response"):
if ref_audio is None:
st.warning("⚠️ Please upload a reference speaker audio file first!")
else:
with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_speaker:
temp_speaker.write(ref_audio.read())
speaker_wav_path = temp_speaker.name
response = get_llm_response(user_query)
output_audio_path = "cloned_chat_response.wav"
generate_speech(response, output_audio_path, speaker_wav_path, language="en")
st.audio(output_audio_path, format="audio/wav")
with open(output_audio_path, "rb") as f:
st.download_button("Download Response Audio", f, file_name="cloned_chat_response.wav", mime="audio/wav")
os.unlink(speaker_wav_path)