import os
import torch
import whisper
import streamlit as st
from groq import Groq
from dotenv import load_dotenv
from tempfile import NamedTemporaryFile
# Load environment variables
load_dotenv()
API_KEY = os.getenv("GROQ_API_KEY")
HF_TOKEN = os.getenv("HF_TOKEN")
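# Note: HF_TOKEN is read here but not referenced elsewhere in this file.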
# By using XTTS you agree to CPML license
os.environ["COQUI_TOS_AGREED"] = "1"
# Import TTS components
from TTS.api import TTS
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts
from TTS.utils.generic_utils import get_user_data_dir
# Download and configure XTTS model
print("Downloading Coqui XTTS V2 if not already downloaded")
from TTS.utils.manage import ModelManager
model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
ModelManager().download_model(model_name)
model_path = os.path.join(get_user_data_dir("tts"), model_name.replace("/", "--"))
print("XTTS downloaded")
config = XttsConfig()
config.load_json(os.path.join(model_path, "config.json"))
model = Xtts.init_from_config(config)
# Load the checkpoint. use_deepspeed=True requires the `deepspeed` package to be
# installed; set it to False if DeepSpeed is not available in the environment.
model.load_checkpoint(
    config,
    checkpoint_path=os.path.join(model_path, "model.pth"),
    vocab_path=os.path.join(model_path, "vocab.json"),
    eval=True,
    use_deepspeed=True,
)
if torch.cuda.is_available():
    model.cuda()
supported_languages = config.languages
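# config.languages lists the language codes XTTS v2 supports (e.g. "en", "es", "fr");
# it drives the output-language selector in the Streamlit sidebar below.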
# LLM Response Function
def get_llm_response(api_key, user_input):
    if not api_key:
        return "API key not found. Please set the GROQ_API_KEY environment variable."
    client = Groq(api_key=api_key)
    prompt = (
        "IMPORTANT: You are an AI assistant that MUST provide responses in 25 words or less.\n"
        "CRITICAL RULES:\n"
        "1. NEVER exceed 25 words unless absolutely necessary.\n"
        "2. Always give a complete sentence with full context.\n"
        "3. Answer directly and precisely.\n"
        "4. Use clear, simple language.\n"
        "5. Maintain a polite, professional tone.\n"
        "6. NO lists, bullet points, or multiple paragraphs.\n"
        "7. NEVER apologize for brevity - embrace it.\n"
        "Your response will be converted to speech. Maximum 25 words."
    )
    try:
        chat_completion = client.chat.completions.create(
            messages=[
                {"role": "system", "content": prompt},
                {"role": "user", "content": user_input},
            ],
            model="llama3-8b-8192",
            temperature=0.5,
            top_p=1,
            stream=False,
        )
        return chat_completion.choices[0].message.content
    except Exception as e:
        return f"Error with LLM: {str(e)}"
# Transcribe Audio
def transcribe_audio(audio_path, model_size="base"):
    try:
        model = whisper.load_model(model_size)
        result = model.transcribe(audio_path)
        return result["text"]
    except Exception as e:
        return f"Error transcribing audio: {str(e)}"
# Generate Speech using the configured XTTS model
def generate_speech(text, output_file, speaker_wav, language="en"):
    if not os.path.exists(speaker_wav):
        raise FileNotFoundError("Reference audio file not found. Please upload a valid audio.")
    if language not in supported_languages:
        st.warning(f"Language {language} is not supported. Defaulting to English.")
        language = "en"
    # Use the configured model directly
    try:
        # Compute speaker conditioning latents from the reference audio
        gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(
            audio_path=speaker_wav,
            gpt_cond_len=30,
            gpt_cond_chunk_len=4,
            max_ref_length=60,
        )
        out = model.inference(
            text,
            language,
            gpt_cond_latent,
            speaker_embedding,
            repetition_penalty=5.0,
            temperature=0.75,
        )
        # Convert the generated waveform to a NumPy array and save it at XTTS's 24 kHz rate
        wav = torch.tensor(out["wav"]).squeeze().cpu().numpy()
        import soundfile as sf
        sf.write(output_file, wav, 24000, subtype="PCM_24")
        return True, "Speech generated successfully"
    except Exception as e:
        return False, f"Error generating speech: {str(e)}"
# Streamlit App
def main():
    st.set_page_config(page_title="Vocal AI", layout="wide")
    st.title("VocaL AI - Voice Cloning Assistant")
    st.write("Clone your voice and interact with an AI assistant that responds in your voice!")
    st.sidebar.title("Settings")

    # Language selection
    language = st.sidebar.selectbox(
        "Output Language",
        supported_languages,
        index=supported_languages.index("en") if "en" in supported_languages else 0,
    )

    # TOS agreement
    agree_tos = st.sidebar.checkbox("I agree to the Coqui Public Model License (CPML)", value=False)

    import uuid

    col1, col2 = st.columns(2)
    with col1:
        st.header("Step 1: Provide Reference Voice")
        reference_audio = st.file_uploader("Upload Reference Audio", type=["wav", "mp3", "ogg"])
        ref_audio_path = None
        if reference_audio:
            with NamedTemporaryFile(delete=False, suffix=".wav") as temp_ref_audio:
                temp_ref_audio.write(reference_audio.read())
                ref_audio_path = temp_ref_audio.name
            st.audio(ref_audio_path)

    with col2:
        st.header("Step 2: Ask Something")
        # User Input (Text or Audio)
        input_type = st.radio("Choose Input Type", ("Text", "Upload Audio"))
        user_input = None
        if input_type == "Text":
            user_input = st.text_area("Enter your question or prompt here")
        else:
            user_audio = st.file_uploader("Upload your question as audio", type=["wav", "mp3", "ogg"])
            if user_audio:
                with NamedTemporaryFile(delete=False, suffix=".wav") as temp_user_audio:
                    temp_user_audio.write(user_audio.read())
                st.audio(temp_user_audio.name)
                user_input = transcribe_audio(temp_user_audio.name)
                st.write(f"Transcribed: {user_input}")

    # Process and generate response
    if st.button("Generate AI Response in My Voice"):
        if not agree_tos:
            st.error("Please agree to the Coqui Public Model License to continue.")
            return
        if not ref_audio_path:
            st.error("Please upload reference audio.")
            return
        if not user_input:
            st.error("Please enter text or upload an audio question.")
            return

        with st.spinner("Processing..."):
            # Get AI Response
            llm_response = get_llm_response(API_KEY, user_input)
            st.subheader("AI Response:")
            st.write(llm_response)

            # Generate Speech
            output_audio_path = f"output_speech_{uuid.uuid4()}.wav"
            success, message = generate_speech(
                llm_response,
                output_audio_path,
                ref_audio_path,
                language,
            )
            if success:
                st.subheader("Listen to the response in your voice:")
                st.audio(output_audio_path, format="audio/wav")
            else:
                st.error(message)


if __name__ == "__main__":
    main()