Update app.py #1
by Hammad112 - opened

app.py CHANGED
@@ -1,16 +1,10 @@
 import os
-import io
 import torch
 import whisper
 import streamlit as st
 from groq import Groq
 from dotenv import load_dotenv
 from tempfile import NamedTemporaryFile
-from streamlit_webrtc import webrtc_streamer, WebRtcMode, ClientSettings
-import av
-import numpy as np
-import uuid
-import time
 
 # Load environment variables
 load_dotenv()
@@ -97,7 +91,7 @@ def transcribe_audio(audio_path, model_size="base"):
 # Generate Speech using the configured XTTS model
 def generate_speech(text, output_file, speaker_wav, language="en"):
     if not os.path.exists(speaker_wav):
-        raise FileNotFoundError("Reference audio file not found. Please upload
+        raise FileNotFoundError("Reference audio file not found. Please upload a valid audio.")
 
     if language not in supported_languages:
         st.warning(f"Language {language} is not supported. Defaulting to English.")
@@ -105,6 +99,7 @@ def generate_speech(text, output_file, speaker_wav, language="en"):
 
     # Use the configured model directly
     try:
+        import time
         t_latent = time.time()
         gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(
             audio_path=speaker_wav,
@@ -131,30 +126,6 @@ def generate_speech(text, output_file, speaker_wav, language="en"):
     except Exception as e:
         return False, f"Error generating speech: {str(e)}"
 
-# Audio Frame Processing for WebRTC
-class AudioProcessor:
-    def __init__(self):
-        self.audio_frames = []
-        self.sample_rate = 24000  # XTTS expects 24kHz
-
-    def recv(self, frame):
-        sound = frame.to_ndarray()
-        self.audio_frames.append(sound)
-        return frame
-
-    def save_audio(self, file_path):
-        if not self.audio_frames:
-            return None
-
-        # Concatenate audio frames
-        concat_audio = np.concatenate(self.audio_frames, axis=0)
-
-        # Save as WAV file
-        import soundfile as sf
-        sf.write(file_path, concat_audio, self.sample_rate)
-
-        return file_path
-
 # Streamlit App
 def main():
     st.set_page_config(page_title="Vocal AI", layout="wide")
@@ -174,86 +145,37 @@ def main():
     # TOS agreement
     agree_tos = st.sidebar.checkbox("I agree to the Coqui Public Model License (CPML)", value=False)
 
-
-
-
-    ref_audio_path = None
-    reference_audio_processor = None
-
+    import uuid
+
     col1, col2 = st.columns(2)
 
     with col1:
         st.header("Step 1: Provide Reference Voice")
-
-
-
-
-
-
-
-
-        st.write("Record your reference voice:")
-        reference_audio_processor = AudioProcessor()
-        webrtc_ctx = webrtc_streamer(
-            key="ref_audio",
-            mode=WebRtcMode.SENDRECV,
-            client_settings=ClientSettings(
-                rtc_configuration={"iceServers": [{"urls": ["stun:stun.l.google.com:19302"]}]},
-                media_stream_constraints={"audio": True, "video": False},
-            ),
-            audio_receiver_size=1024,
-            video_processor_factory=None,
-            audio_processor_factory=lambda: reference_audio_processor,
-        )
-
-        if webrtc_ctx.state.playing and reference_audio_processor is not None:
-            st.info("Recording... Speak into your microphone.")
-
-        if st.button("Save Reference Audio"):
-            if reference_audio_processor and reference_audio_processor.audio_frames:
-                with NamedTemporaryFile(delete=False, suffix=".wav") as temp_ref_audio:
-                    reference_audio_processor.save_audio(temp_ref_audio.name)
-                    ref_audio_path = temp_ref_audio.name
-                    st.success("Reference audio saved!")
-                    st.audio(ref_audio_path)
-            else:
-                st.error("No audio recorded. Please speak into your microphone.")
+        reference_audio = st.file_uploader("Upload Reference Audio", type=["wav", "mp3", "ogg"])
+        ref_audio_path = None
+
+        if reference_audio:
+            with NamedTemporaryFile(delete=False, suffix=".wav") as temp_ref_audio:
+                temp_ref_audio.write(reference_audio.read())
+                ref_audio_path = temp_ref_audio.name
+                st.audio(ref_audio_path)
 
     with col2:
         st.header("Step 2: Ask Something")
         # User Input (Text or Audio)
-        input_type = st.radio("Choose Input Type", ("Text", "Audio"))
+        input_type = st.radio("Choose Input Type", ("Text", "Upload Audio"))
         user_input = None
-        user_audio_processor = None
 
         if input_type == "Text":
            user_input = st.text_area("Enter your question or prompt here")
        else:
-            st.
-
-
-
-
-
-
-                    media_stream_constraints={"audio": True, "video": False},
-                ),
-                audio_receiver_size=1024,
-                video_processor_factory=None,
-                audio_processor_factory=lambda: user_audio_processor,
-            )
-
-            if webrtc_ctx_user.state.playing and user_audio_processor is not None:
-                st.info("Recording... Ask your question")
-
-            if st.button("Process Recording"):
-                if user_audio_processor and user_audio_processor.audio_frames:
-                    with NamedTemporaryFile(delete=False, suffix=".wav") as temp_user_audio:
-                        user_audio_processor.save_audio(temp_user_audio.name)
-                        user_input = transcribe_audio(temp_user_audio.name)
-                        st.write(f"Transcribed: {user_input}")
-                else:
-                    st.error("No audio recorded. Please speak into your microphone.")
+            user_audio = st.file_uploader("Upload your question as audio", type=["wav", "mp3", "ogg"])
+            if user_audio:
+                with NamedTemporaryFile(delete=False, suffix=".wav") as temp_user_audio:
+                    temp_user_audio.write(user_audio.read())
+                    st.audio(temp_user_audio.name)
+                    user_input = transcribe_audio(temp_user_audio.name)
+                    st.write(f"Transcribed: {user_input}")
 
     # Process and generate response
     if st.button("Generate AI Response in My Voice"):
@@ -262,11 +184,11 @@ def main():
         return
 
     if not ref_audio_path:
-        st.error("Please
+        st.error("Please upload reference audio.")
        return
 
     if not user_input:
-        st.error("Please enter text or
+        st.error("Please enter text or upload an audio question.")
        return
 
    with st.spinner("Processing..."):
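For context on the new flow, here is a minimal, self-contained sketch of the upload-then-transcribe pattern this change switches to; it is not part of the PR. The transcribe_audio helper below is a stand-in for the Whisper-based function already defined in app.py, and the explicit flush() is an added precaution, not something the diff introduces.

import whisper
import streamlit as st
from tempfile import NamedTemporaryFile

def transcribe_audio(audio_path, model_size="base"):
    # Stand-in for the existing helper in app.py: load a Whisper model
    # and return the transcription text for the given file.
    model = whisper.load_model(model_size)
    return model.transcribe(audio_path)["text"]

uploaded = st.file_uploader("Upload audio", type=["wav", "mp3", "ogg"])
if uploaded:
    # Persist the uploaded bytes to a temporary WAV path so Whisper can read it.
    with NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
        tmp.write(uploaded.read())
        tmp.flush()  # assumption: ensure bytes are on disk before transcribing
        audio_path = tmp.name
    st.audio(audio_path)
    st.write(f"Transcribed: {transcribe_audio(audio_path)}")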