Update app.py
app.py CHANGED
@@ -203,49 +203,30 @@ pipe_asr = pipeline(
     tokenizer=processor.tokenizer,
     feature_extractor=processor.feature_extractor,
     max_new_tokens=128,
-    chunk_length_s=
+    chunk_length_s=5,  # Process audio in 5-second chunks
     batch_size=16,
     torch_dtype=torch_dtype,
     device=device,
-    return_timestamps=
+    return_timestamps=False
 )
 
-#
-
-
-# Function to handle voice input, generate response from Neo4j, and return audio output
-def handle_voice_to_voice(audio):
+# Function to process audio in real-time and update the transcription
+def transcribe_audio_real_time(audio):
     try:
-        # Transcribe audio input to text
         sr, y = audio
         y = y.astype(np.float32)
         y = y / np.max(np.abs(y))  # Normalize audio to range [-1.0, 1.0]
 
-
-        logging.debug(f"Audio data: {y[:100]}")  # Log first 100 samples for brevity
-
-        # Process the audio data with Whisper ASR
+        # Process the audio data with Whisper ASR in chunks
         result = pipe_asr({"array": y, "sampling_rate": sr}, return_timestamps=False)
-
-
-        logging.debug(f"Transcribed question: {question}")
-
-        if not question:
-            return "No transcription available, please try again.", ""
+        transcription = result.get("text", "")
 
-
-
-        logging.debug(f"Response from Neo4j and GPT: {response}")
-
-        # Generate audio from the response
-        audio_path = generate_audio_elevenlabs(response)
-        logging.debug(f"Generated audio path: {audio_path}")
-
-        # Return the transcription and the audio path
-        return audio_path, question
+        logging.debug(f"Real-time transcription: {transcription}")
+        return transcription
     except Exception as e:
-        logging.error(f"Error
-        return "Error processing the audio, please try again."
+        logging.error(f"Error during real-time transcription: {e}")
+        return "Error processing the audio, please try again."
+
 
 # Function to clear the transcription state
 def clear_state():
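For context, the hunk above shows only the tail of the `pipe_asr = pipeline(` call named in its header. Below is a minimal sketch of the Whisper setup it implies; the checkpoint name and the model/processor loading lines are assumptions (the commit does not show them), while the pipeline arguments mirror the post-commit configuration:

    import torch
    from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

    # Assumed checkpoint: the commit does not show which Whisper model is loaded.
    model_id = "openai/whisper-large-v3"

    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

    model = AutoModelForSpeechSeq2Seq.from_pretrained(
        model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
    ).to(device)
    processor = AutoProcessor.from_pretrained(model_id)

    pipe_asr = pipeline(
        "automatic-speech-recognition",
        model=model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        max_new_tokens=128,
        chunk_length_s=5,        # post-commit value: 5-second windows
        batch_size=16,
        torch_dtype=torch_dtype,
        device=device,
        return_timestamps=False  # post-commit value: plain text, no timestamps
    )

Presumably `chunk_length_s=5` is chosen to match the cadence of streamed microphone snippets, trading some accuracy on long sentences for a shorter delay between speech and the partial transcript.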
@@ -254,21 +235,33 @@ def clear_state():
 # Define the Gradio interface
 with gr.Blocks(theme="rawrsor1/Everforest") as demo:
     audio_input = gr.Audio(sources=["microphone"], type='numpy', streaming=True, label="Speak to Ask")
+    transcription_textbox = gr.Textbox(label="Transcription", interactive=False)
     submit_voice_btn = gr.Button("Submit Voice")
     clear_state_btn = gr.Button("Clear State")
-    transcription_textbox = gr.Textbox(label="Transcription", interactive=False)
     audio_output = gr.Audio(label="Response Audio", type="filepath", autoplay=True, interactive=False)
 
-    #
-
-        fn=
+    # Update the transcription text in real-time as the user speaks
+    audio_input.stream(
+        fn=transcribe_audio_real_time,
         inputs=audio_input,
-        outputs=
+        outputs=transcription_textbox
+    )
+
+    # Define a placeholder function for handling submission
+    def handle_submit(text):
+        # Placeholder function, could trigger response generation or other actions
+        return f"You submitted: {text}"
+
+    # Handle the submission of the final transcribed text
+    submit_voice_btn.click(
+        fn=handle_submit,
+        inputs=transcription_textbox,
+        outputs=transcription_textbox
     )
 
     # Interaction for Clear State Button
     clear_state_btn.click(
-        fn=
+        fn=lambda: "",
         outputs=transcription_textbox
     )
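One caveat with the new wiring: with `streaming=True`, Gradio fires the `stream` event repeatedly, passing each fresh microphone chunk to `transcribe_audio_real_time`, so the function as committed transcribes only the latest chunk rather than the utterance so far. The usual pattern is to carry the accumulated waveform in a `gr.State`; here is a minimal sketch under that assumption (`transcribe_accumulated` and `stream_state` are illustrative, not part of this commit, and `pipe_asr` is the pipeline configured above):

    import numpy as np
    import gradio as gr

    def transcribe_accumulated(state, new_chunk):
        # Accumulate streamed chunks and transcribe the whole utterance so far.
        sr, y = new_chunk
        y = y.astype(np.float32)
        y = y / (np.max(np.abs(y)) + 1e-8)  # guard against all-zero (silent) chunks
        stream = y if state is None else np.concatenate([state, y])
        result = pipe_asr({"array": stream, "sampling_rate": sr}, return_timestamps=False)
        return stream, result.get("text", "")

    with gr.Blocks(theme="rawrsor1/Everforest") as demo:
        audio_input = gr.Audio(sources=["microphone"], type="numpy", streaming=True, label="Speak to Ask")
        transcription_textbox = gr.Textbox(label="Transcription", interactive=False)
        stream_state = gr.State(None)  # hypothetical accumulator, not in the commit

        audio_input.stream(
            fn=transcribe_accumulated,
            inputs=[stream_state, audio_input],
            outputs=[stream_state, transcription_textbox],
        )

The `+ 1e-8` term also avoids a latent division by zero in the committed normalization (`y / np.max(np.abs(y))` fails on an all-zero chunk). If the accumulator is used, the Clear State button should reset it as well, e.g. `fn=lambda: ("", None)` with `outputs=[transcription_textbox, stream_state]`. Note too that the commit replaces the old Neo4j/ElevenLabs voice-to-voice handler with the placeholder `handle_submit`, so `generate_audio_elevenlabs` is no longer called anywhere; restoring spoken answers would mean wiring response generation back into `submit_voice_btn.click` with `audio_output` among its outputs.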