Files changed (1)
  1. app.py +22 -100
app.py CHANGED
@@ -1,16 +1,10 @@
 import os
-import io
 import torch
 import whisper
 import streamlit as st
 from groq import Groq
 from dotenv import load_dotenv
 from tempfile import NamedTemporaryFile
-from streamlit_webrtc import webrtc_streamer, WebRtcMode, ClientSettings
-import av
-import numpy as np
-import uuid
-import time
 
 # Load environment variables
 load_dotenv()
@@ -97,7 +91,7 @@ def transcribe_audio(audio_path, model_size="base"):
 # Generate Speech using the configured XTTS model
 def generate_speech(text, output_file, speaker_wav, language="en"):
     if not os.path.exists(speaker_wav):
-        raise FileNotFoundError("Reference audio file not found. Please upload or record a valid audio.")
+        raise FileNotFoundError("Reference audio file not found. Please upload a valid audio.")
 
     if language not in supported_languages:
         st.warning(f"Language {language} is not supported. Defaulting to English.")
@@ -105,6 +99,7 @@ def generate_speech(text, output_file, speaker_wav, language="en"):
 
     # Use the configured model directly
     try:
+        import time
         t_latent = time.time()
         gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(
             audio_path=speaker_wav,
@@ -131,30 +126,6 @@ def generate_speech(text, output_file, speaker_wav, language="en"):
     except Exception as e:
         return False, f"Error generating speech: {str(e)}"
 
-# Audio Frame Processing for WebRTC
-class AudioProcessor:
-    def __init__(self):
-        self.audio_frames = []
-        self.sample_rate = 24000  # XTTS expects 24kHz
-
-    def recv(self, frame):
-        sound = frame.to_ndarray()
-        self.audio_frames.append(sound)
-        return frame
-
-    def save_audio(self, file_path):
-        if not self.audio_frames:
-            return None
-
-        # Concatenate audio frames
-        concat_audio = np.concatenate(self.audio_frames, axis=0)
-
-        # Save as WAV file
-        import soundfile as sf
-        sf.write(file_path, concat_audio, self.sample_rate)
-
-        return file_path
-
 # Streamlit App
 def main():
     st.set_page_config(page_title="Vocal AI", layout="wide")
@@ -174,86 +145,37 @@ def main():
     # TOS agreement
     agree_tos = st.sidebar.checkbox("I agree to the Coqui Public Model License (CPML)", value=False)
 
-    # User option for reference audio (Record or Upload)
-    ref_audio_choice = st.sidebar.radio("Reference Audio", ("Upload", "Record"))
-
-    ref_audio_path = None
-    reference_audio_processor = None
-
+    import uuid
+
     col1, col2 = st.columns(2)
 
     with col1:
         st.header("Step 1: Provide Reference Voice")
-        if ref_audio_choice == "Upload":
-            reference_audio = st.file_uploader("Upload Reference Audio", type=["wav", "mp3", "ogg"])
-            if reference_audio:
-                with NamedTemporaryFile(delete=False, suffix=".wav") as temp_ref_audio:
-                    temp_ref_audio.write(reference_audio.read())
-                    ref_audio_path = temp_ref_audio.name
-                st.audio(ref_audio_path)
-        else:
-            st.write("Record your reference voice:")
-            reference_audio_processor = AudioProcessor()
-            webrtc_ctx = webrtc_streamer(
-                key="ref_audio",
-                mode=WebRtcMode.SENDRECV,
-                client_settings=ClientSettings(
-                    rtc_configuration={"iceServers": [{"urls": ["stun:stun.l.google.com:19302"]}]},
-                    media_stream_constraints={"audio": True, "video": False},
-                ),
-                audio_receiver_size=1024,
-                video_processor_factory=None,
-                audio_processor_factory=lambda: reference_audio_processor,
-            )
-
-            if webrtc_ctx.state.playing and reference_audio_processor is not None:
-                st.info("Recording... Speak into your microphone.")
-
-            if st.button("Save Reference Audio"):
-                if reference_audio_processor and reference_audio_processor.audio_frames:
-                    with NamedTemporaryFile(delete=False, suffix=".wav") as temp_ref_audio:
-                        reference_audio_processor.save_audio(temp_ref_audio.name)
-                        ref_audio_path = temp_ref_audio.name
-                    st.success("Reference audio saved!")
-                    st.audio(ref_audio_path)
-                else:
-                    st.error("No audio recorded. Please speak into your microphone.")
+        reference_audio = st.file_uploader("Upload Reference Audio", type=["wav", "mp3", "ogg"])
+        ref_audio_path = None
+
+        if reference_audio:
+            with NamedTemporaryFile(delete=False, suffix=".wav") as temp_ref_audio:
+                temp_ref_audio.write(reference_audio.read())
+                ref_audio_path = temp_ref_audio.name
+            st.audio(ref_audio_path)
 
     with col2:
         st.header("Step 2: Ask Something")
         # User Input (Text or Audio)
-        input_type = st.radio("Choose Input Type", ("Text", "Audio"))
+        input_type = st.radio("Choose Input Type", ("Text", "Upload Audio"))
         user_input = None
-        user_audio_processor = None
 
         if input_type == "Text":
            user_input = st.text_area("Enter your question or prompt here")
        else:
-            st.write("Record your question:")
-            user_audio_processor = AudioProcessor()
-            webrtc_ctx_user = webrtc_streamer(
-                key="user_audio",
-                mode=WebRtcMode.SENDRECV,
-                client_settings=ClientSettings(
-                    rtc_configuration={"iceServers": [{"urls": ["stun:stun.l.google.com:19302"]}]},
-                    media_stream_constraints={"audio": True, "video": False},
-                ),
-                audio_receiver_size=1024,
-                video_processor_factory=None,
-                audio_processor_factory=lambda: user_audio_processor,
-            )
-
-            if webrtc_ctx_user.state.playing and user_audio_processor is not None:
-                st.info("Recording... Ask your question")
-
-            if st.button("Process Recording"):
-                if user_audio_processor and user_audio_processor.audio_frames:
-                    with NamedTemporaryFile(delete=False, suffix=".wav") as temp_user_audio:
-                        user_audio_processor.save_audio(temp_user_audio.name)
-                        user_input = transcribe_audio(temp_user_audio.name)
-                    st.write(f"Transcribed: {user_input}")
-                else:
-                    st.error("No audio recorded. Please speak into your microphone.")
+            user_audio = st.file_uploader("Upload your question as audio", type=["wav", "mp3", "ogg"])
+            if user_audio:
+                with NamedTemporaryFile(delete=False, suffix=".wav") as temp_user_audio:
+                    temp_user_audio.write(user_audio.read())
+                st.audio(temp_user_audio.name)
+                user_input = transcribe_audio(temp_user_audio.name)
+                st.write(f"Transcribed: {user_input}")
 
     # Process and generate response
     if st.button("Generate AI Response in My Voice"):
@@ -262,11 +184,11 @@ def main():
             return
 
         if not ref_audio_path:
-            st.error("Please provide reference audio (upload or record).")
+            st.error("Please upload reference audio.")
             return
 
         if not user_input:
-            st.error("Please enter text or record a question.")
+            st.error("Please enter text or upload an audio question.")
             return
 
         with st.spinner("Processing..."):
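
Net effect of the change: both the reference voice and the user's question move from in-browser WebRTC recording to plain file uploads, dropping the streamlit_webrtc, av, numpy, and soundfile dependencies. A minimal sketch of the resulting upload-and-transcribe flow follows; the body of transcribe_audio is an assumption based on the whisper import (the PR's actual helper is defined in an unchanged section of app.py).

# Minimal sketch of the upload-based flow introduced by this PR.
# transcribe_audio is a stand-in for app.py's helper; its body here is
# assumed from the openai-whisper import, not copied from the PR.
import whisper
import streamlit as st
from tempfile import NamedTemporaryFile

def transcribe_audio(audio_path, model_size="base"):
    # Assumption: load a Whisper model and return the transcript text.
    model = whisper.load_model(model_size)
    return model.transcribe(audio_path)["text"]

user_audio = st.file_uploader("Upload your question as audio", type=["wav", "mp3", "ogg"])
if user_audio:
    # Persist the upload so path-based APIs (Whisper, XTTS) can read it.
    with NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
        tmp.write(user_audio.read())
    st.audio(tmp.name)
    st.write(f"Transcribed: {transcribe_audio(tmp.name)}")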