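"""VocalAI - a Streamlit app that clones a reference speaker's voice.

It combines Coqui XTTS v2 (voice cloning), OpenAI Whisper (speech-to-text),
and a Groq-hosted Llama 3 model (short chatbot replies).

Assumed usage (not part of the original listing): save this file as app.py,
install streamlit, streamlit-option-menu, openai-whisper, TTS, and groq,
then run `streamlit run app.py`.
"""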
import streamlit as st
import tempfile
import os
import whisper
from TTS.api import TTS
from groq import Groq
from streamlit_option_menu import option_menu
# Read the Groq API key from the environment rather than hardcoding a secret
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")

# Load the XTTS v2 voice-cloning model once and cache it across Streamlit reruns
@st.cache_resource
def load_tts():
    return TTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=False)

tts = load_tts()
def get_llm_response(user_input):
    """Ask the Groq-hosted Llama 3 model for a short (max ~25 words) reply."""
    client = Groq(api_key=GROQ_API_KEY)
    prompt = ("IMPORTANT: You are an AI assistant that MUST provide responses in 25 words or less. NO EXCEPTIONS...")
    chat_completion = client.chat.completions.create(
        messages=[
            {"role": "system", "content": prompt},
            {"role": "user", "content": user_input},
        ],
        model="llama3-8b-8192",
        temperature=0.5,
        top_p=1,
        stop=None,
        stream=False,
    )
    return chat_completion.choices[0].message.content
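# Hypothetical example (not from the original listing):
# get_llm_response("What is AI?") should return a single reply of roughly
# 25 words or fewer, as enforced by the system prompt above.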
@st.cache_resource
def load_whisper(model_size="base"):
    # Cache the Whisper model so it is not reloaded on every Streamlit rerun
    return whisper.load_model(model_size)

def transcribe_audio(audio_path, model_size="base"):
    """Transcribe an audio file to text with OpenAI Whisper."""
    return load_whisper(model_size).transcribe(audio_path)["text"]
def generate_speech(text, output_file, speaker_wav, language="hi"):
    # Clone the reference speaker's voice and synthesize the given text
    tts.tts_to_file(text=text, file_path=output_file, speaker_wav=speaker_wav, language=language)
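# Note: XTTS v2 expects short ISO-style language codes such as "en" or "hi".
# Both UI pages below call generate_speech(..., language="en"), so the "hi"
# default above only applies when the argument is omitted.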
# UI Design
st.title("🗣️ VocalAI - AI-Powered Voice Cloning & Chatbot")

selected_page = option_menu(
    menu_title=None,
    options=["Text-to-Speech", "Voice-Cloned Chatbot"],
    icons=["mic", "chat-dots"],
    menu_icon="cast",
    default_index=0,
    orientation="horizontal",
)
# Sidebar - Reference Speaker Audio Upload
st.sidebar.header("Upload Reference Audio")
ref_audio = st.sidebar.file_uploader("Upload a speaker audio file (WAV, OGG, or MP3)", type=["wav", "ogg", "mp3"])
if selected_page == "Text-to-Speech":
    st.header("🔊 Text-to-Speech (TTS)")
    text = st.text_area("Enter text to synthesize:", "Hello, this is a cloned voice test.")

    if st.button("Generate Voice"):
        if ref_audio is None:
            st.warning("⚠️ Please upload a reference speaker audio file first!")
        else:
            # Persist the uploaded reference audio to a temporary file for XTTS
            with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_speaker:
                temp_speaker.write(ref_audio.read())
                speaker_wav_path = temp_speaker.name

            output_path = "cloned_output.wav"
            generate_speech(text, output_path, speaker_wav_path, language="en")

            st.audio(output_path, format="audio/wav")
            with open(output_path, "rb") as f:
                st.download_button("Download Cloned Voice", f, file_name="cloned_voice.wav", mime="audio/wav")
            os.unlink(speaker_wav_path)
elif selected_page == "Voice-Cloned Chatbot":
    st.header("💬 AI Chatbot with Voice Cloning")
    user_query = st.text_area("Enter your query:", "Hello, explain AI briefly.")
    uploaded_voice = st.file_uploader("Or upload an audio query (WAV, OGG, or MP3)", type=["wav", "ogg", "mp3"])

    if uploaded_voice is not None:
        # Transcribe the spoken query with Whisper, then discard the temp file
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio:
            temp_audio.write(uploaded_voice.read())
            audio_path = temp_audio.name
        user_query = transcribe_audio(audio_path)
        os.unlink(audio_path)
        st.write("**Transcribed Query:**", user_query)
if st.button("Generate Response"): | |
if ref_audio is None: | |
st.warning("β οΈ Please upload a reference speaker audio file first!") | |
else: | |
with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_speaker: | |
temp_speaker.write(ref_audio.read()) | |
speaker_wav_path = temp_speaker.name | |
response = get_llm_response(user_query) | |
output_audio_path = "cloned_chat_response.wav" | |
generate_speech(response, output_audio_path, speaker_wav_path, language="en") | |
st.audio(output_audio_path, format="audio/wav") | |
with open(output_audio_path, "rb") as f: | |
st.download_button("Download Response Audio", f, file_name="cloned_chat_response.wav", mime="audio/wav") | |
os.unlink(speaker_wav_path) | |
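# Note: the generated files ("cloned_output.wav", "cloned_chat_response.wav")
# are written to the working directory and are not removed automatically;
# only the temporary speaker/query files are unlinked above.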