# FYP-VocalAI / app.py
import os
import uuid
from tempfile import NamedTemporaryFile

import torch
import whisper
import soundfile as sf
import streamlit as st
from groq import Groq
from dotenv import load_dotenv
# Load environment variables
load_dotenv()
API_KEY = os.getenv("GROQ_API_KEY")
HF_TOKEN = os.getenv("HF_TOKEN")
# By using XTTS you agree to CPML license
os.environ["COQUI_TOS_AGREED"] = "1"
# Import TTS components
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts
from TTS.utils.generic_utils import get_user_data_dir
# Download and configure XTTS model
print("Downloading Coqui XTTS V2 if not already downloaded")
from TTS.utils.manage import ModelManager
model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
ModelManager().download_model(model_name)
model_path = os.path.join(get_user_data_dir("tts"), model_name.replace("/", "--"))
print("XTTS downloaded")
config = XttsConfig()
config.load_json(os.path.join(model_path, "config.json"))
model = Xtts.init_from_config(config)
model.load_checkpoint(
    config,
    checkpoint_path=os.path.join(model_path, "model.pth"),
    vocab_path=os.path.join(model_path, "vocab.json"),
    eval=True,
    use_deepspeed=True,  # requires the deepspeed package; set False if it is not installed
)
if torch.cuda.is_available():
    model.cuda()
supported_languages = config.languages
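# Optional sanity check (a minimal sketch, not required for the app): confirm
# the checkpoint is loaded on the expected device and see which language codes
# XTTS v2 reports before the UI starts.
print(
    f"XTTS ready on {'cuda' if next(model.parameters()).is_cuda else 'cpu'}; "
    f"languages: {', '.join(supported_languages)}"
)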
# LLM Response Function
def get_llm_response(api_key, user_input):
    if not api_key:
        return "API key not found. Please set the GROQ_API_KEY environment variable."
    client = Groq(api_key=api_key)
    prompt = (
        "IMPORTANT: You are an AI assistant that MUST provide responses in 25 words or less.\n"
        "CRITICAL RULES:\n"
        "1. NEVER exceed 25 words unless absolutely necessary.\n"
        "2. Always give a complete sentence with full context.\n"
        "3. Answer directly and precisely.\n"
        "4. Use clear, simple language.\n"
        "5. Maintain a polite, professional tone.\n"
        "6. NO lists, bullet points, or multiple paragraphs.\n"
        "7. NEVER apologize for brevity - embrace it.\n"
        "Your response will be converted to speech. Maximum 25 words."
    )
    try:
        chat_completion = client.chat.completions.create(
            messages=[
                {"role": "system", "content": prompt},
                {"role": "user", "content": user_input},
            ],
            model="llama3-8b-8192",
            temperature=0.5,
            top_p=1,
            stream=False,
        )
        return chat_completion.choices[0].message.content
    except Exception as e:
        return f"Error with LLM: {str(e)}"
# Transcribe Audio
def transcribe_audio(audio_path, model_size="base"):
    try:
        # Note: the Whisper model is reloaded on every call. That is acceptable
        # for a demo, but a production app would cache it (e.g. st.cache_resource).
        whisper_model = whisper.load_model(model_size)
        result = whisper_model.transcribe(audio_path)
        return result["text"]
    except Exception as e:
        return f"Error transcribing audio: {str(e)}"
# Generate Speech using the configured XTTS model
def generate_speech(text, output_file, speaker_wav, language="en"):
    if not os.path.exists(speaker_wav):
        raise FileNotFoundError("Reference audio file not found. Please upload a valid audio.")
    if language not in supported_languages:
        st.warning(f"Language {language} is not supported. Defaulting to English.")
        language = "en"
    # Use the configured model directly
    try:
        # Extract speaker conditioning latents from the reference audio
        gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(
            audio_path=speaker_wav,
            gpt_cond_len=30,
            gpt_cond_chunk_len=4,
            max_ref_length=60,
        )
        # Synthesize speech conditioned on the cloned voice
        out = model.inference(
            text,
            language,
            gpt_cond_latent,
            speaker_embedding,
            repetition_penalty=5.0,
            temperature=0.75,
        )
        # XTTS outputs 24 kHz audio; save it as 24-bit PCM WAV
        sf.write(output_file, out["wav"], 24000, "PCM_24")
        return True, "Speech generated successfully"
    except Exception as e:
        return False, f"Error generating speech: {str(e)}"
# Streamlit App
def main():
    st.set_page_config(page_title="Vocal AI", layout="wide")
    st.title("Vocal AI - Voice Cloning Assistant")
    st.write("Clone your voice and interact with an AI assistant that responds in your voice!")
    st.sidebar.title("Settings")

    # Language selection
    language = st.sidebar.selectbox(
        "Output Language",
        supported_languages,
        index=supported_languages.index("en") if "en" in supported_languages else 0,
    )

    # TOS agreement
    agree_tos = st.sidebar.checkbox("I agree to the Coqui Public Model License (CPML)", value=False)

    col1, col2 = st.columns(2)
    with col1:
        st.header("Step 1: Provide Reference Voice")
        reference_audio = st.file_uploader("Upload Reference Audio", type=["wav", "mp3", "ogg"])
        ref_audio_path = None
        if reference_audio:
            with NamedTemporaryFile(delete=False, suffix=".wav") as temp_ref_audio:
                temp_ref_audio.write(reference_audio.read())
                ref_audio_path = temp_ref_audio.name
            st.audio(ref_audio_path)

    with col2:
        st.header("Step 2: Ask Something")
        # User input (text or audio)
        input_type = st.radio("Choose Input Type", ("Text", "Upload Audio"))
        user_input = None
        if input_type == "Text":
            user_input = st.text_area("Enter your question or prompt here")
        else:
            user_audio = st.file_uploader("Upload your question as audio", type=["wav", "mp3", "ogg"])
            if user_audio:
                with NamedTemporaryFile(delete=False, suffix=".wav") as temp_user_audio:
                    temp_user_audio.write(user_audio.read())
                st.audio(temp_user_audio.name)
                user_input = transcribe_audio(temp_user_audio.name)
                st.write(f"Transcribed: {user_input}")

    # Process and generate response
    if st.button("Generate AI Response in My Voice"):
        if not agree_tos:
            st.error("Please agree to the Coqui Public Model License to continue.")
            return
        if not ref_audio_path:
            st.error("Please upload reference audio.")
            return
        if not user_input:
            st.error("Please enter text or upload an audio question.")
            return

        with st.spinner("Processing..."):
            # Get AI response
            llm_response = get_llm_response(API_KEY, user_input)
            st.subheader("AI Response:")
            st.write(llm_response)

            # Generate speech in the cloned voice
            output_audio_path = f"output_speech_{uuid.uuid4()}.wav"
            success, message = generate_speech(
                llm_response,
                output_audio_path,
                ref_audio_path,
                language,
            )
            if success:
                st.subheader("Listen to the response in your voice:")
                st.audio(output_audio_path, format="audio/wav")
            else:
                st.error(message)
if __name__ == "__main__":
    main()