# FYP-VocalAI / app.py
import os
import uuid
from tempfile import NamedTemporaryFile

import torch
import whisper
import soundfile as sf
import streamlit as st
from groq import Groq
from dotenv import load_dotenv
# Load environment variables
load_dotenv()
API_KEY = os.getenv("GROQ_API_KEY")
HF_TOKEN = os.getenv("HF_TOKEN")
# By using XTTS you agree to CPML license
os.environ["COQUI_TOS_AGREED"] = "1"
# Import TTS components
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts
from TTS.utils.generic_utils import get_user_data_dir
# Download and configure XTTS model
print("Downloading Coqui XTTS V2 if not already downloaded")
from TTS.utils.manage import ModelManager
model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
ModelManager().download_model(model_name)
model_path = os.path.join(get_user_data_dir("tts"), model_name.replace("/", "--"))
print("XTTS downloaded")
config = XttsConfig()
config.load_json(os.path.join(model_path, "config.json"))
model = Xtts.init_from_config(config)
model.load_checkpoint(
    config,
    checkpoint_path=os.path.join(model_path, "model.pth"),
    vocab_path=os.path.join(model_path, "vocab.json"),
    eval=True,
    use_deepspeed=True,  # requires the deepspeed package; set False if it is not installed
)
if torch.cuda.is_available():
    model.cuda()
supported_languages = config.languages
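# Optional sanity check (a minimal sketch, not required for the app): confirm
# the checkpoint is loaded on the expected device and see which language codes
# XTTS v2 reports before the UI starts.
print(
    f"XTTS ready on {'cuda' if next(model.parameters()).is_cuda else 'cpu'}; "
    f"languages: {', '.join(supported_languages)}"
)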
# LLM Response Function
def get_llm_response(api_key, user_input):
    if not api_key:
        return "API key not found. Please set the GROQ_API_KEY environment variable."
    client = Groq(api_key=api_key)
    prompt = (
        "IMPORTANT: You are an AI assistant that MUST provide responses in 25 words or less.\n"
        "CRITICAL RULES:\n"
        "1. NEVER exceed 25 words unless absolutely necessary.\n"
        "2. Always give a complete sentence with full context.\n"
        "3. Answer directly and precisely.\n"
        "4. Use clear, simple language.\n"
        "5. Maintain a polite, professional tone.\n"
        "6. NO lists, bullet points, or multiple paragraphs.\n"
        "7. NEVER apologize for brevity - embrace it.\n"
        "Your response will be converted to speech. Maximum 25 words."
    )
    try:
        chat_completion = client.chat.completions.create(
            messages=[
                {"role": "system", "content": prompt},
                {"role": "user", "content": user_input},
            ],
            model="llama3-8b-8192",
            temperature=0.5,
            top_p=1,
            stream=False,
        )
        return chat_completion.choices[0].message.content
    except Exception as e:
        return f"Error with LLM: {str(e)}"
# Transcribe Audio
def transcribe_audio(audio_path, model_size="base"):
    try:
        # Note: the Whisper model is reloaded on every call. That is acceptable
        # for a demo, but a production app would cache it (e.g. st.cache_resource).
        whisper_model = whisper.load_model(model_size)
        result = whisper_model.transcribe(audio_path)
        return result["text"]
    except Exception as e:
        return f"Error transcribing audio: {str(e)}"
# Generate Speech using the configured XTTS model
def generate_speech(text, output_file, speaker_wav, language="en"):
    if not os.path.exists(speaker_wav):
        raise FileNotFoundError("Reference audio file not found. Please upload a valid audio.")
    if language not in supported_languages:
        st.warning(f"Language {language} is not supported. Defaulting to English.")
        language = "en"
    # Use the configured model directly
    try:
        # Extract speaker conditioning latents from the reference audio
        gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(
            audio_path=speaker_wav,
            gpt_cond_len=30,
            gpt_cond_chunk_len=4,
            max_ref_length=60,
        )
        # Synthesize speech conditioned on the cloned voice
        out = model.inference(
            text,
            language,
            gpt_cond_latent,
            speaker_embedding,
            repetition_penalty=5.0,
            temperature=0.75,
        )
        # XTTS outputs 24 kHz audio; save it as 24-bit PCM WAV
        sf.write(output_file, out["wav"], 24000, "PCM_24")
        return True, "Speech generated successfully"
    except Exception as e:
        return False, f"Error generating speech: {str(e)}"
# Streamlit App
def main():
    st.set_page_config(page_title="Vocal AI", layout="wide")
    st.title("Vocal AI - Voice Cloning Assistant")
    st.write("Clone your voice and interact with an AI assistant that responds in your voice!")
    st.sidebar.title("Settings")

    # Language selection
    language = st.sidebar.selectbox(
        "Output Language",
        supported_languages,
        index=supported_languages.index("en") if "en" in supported_languages else 0,
    )

    # TOS agreement
    agree_tos = st.sidebar.checkbox("I agree to the Coqui Public Model License (CPML)", value=False)

    col1, col2 = st.columns(2)
    with col1:
        st.header("Step 1: Provide Reference Voice")
        reference_audio = st.file_uploader("Upload Reference Audio", type=["wav", "mp3", "ogg"])
        ref_audio_path = None
        if reference_audio:
            with NamedTemporaryFile(delete=False, suffix=".wav") as temp_ref_audio:
                temp_ref_audio.write(reference_audio.read())
                ref_audio_path = temp_ref_audio.name
            st.audio(ref_audio_path)

    with col2:
        st.header("Step 2: Ask Something")
        # User input (text or audio)
        input_type = st.radio("Choose Input Type", ("Text", "Upload Audio"))
        user_input = None
        if input_type == "Text":
            user_input = st.text_area("Enter your question or prompt here")
        else:
            user_audio = st.file_uploader("Upload your question as audio", type=["wav", "mp3", "ogg"])
            if user_audio:
                with NamedTemporaryFile(delete=False, suffix=".wav") as temp_user_audio:
                    temp_user_audio.write(user_audio.read())
                st.audio(temp_user_audio.name)
                user_input = transcribe_audio(temp_user_audio.name)
                st.write(f"Transcribed: {user_input}")

    # Process and generate response
    if st.button("Generate AI Response in My Voice"):
        if not agree_tos:
            st.error("Please agree to the Coqui Public Model License to continue.")
            return
        if not ref_audio_path:
            st.error("Please upload reference audio.")
            return
        if not user_input:
            st.error("Please enter text or upload an audio question.")
            return

        with st.spinner("Processing..."):
            # Get AI response
            llm_response = get_llm_response(API_KEY, user_input)
            st.subheader("AI Response:")
            st.write(llm_response)

            # Generate speech in the cloned voice
            output_audio_path = f"output_speech_{uuid.uuid4()}.wav"
            success, message = generate_speech(
                llm_response,
                output_audio_path,
                ref_audio_path,
                language,
            )
            if success:
                st.subheader("Listen to the response in your voice:")
                st.audio(output_audio_path, format="audio/wav")
            else:
                st.error(message)
if __name__ == "__main__":
    main()