Pijush2023 committed (verified)
Commit 58f24a5 · 1 Parent(s): d8e6689

Update app.py

Files changed (1)
  1. app.py +29 -36
app.py CHANGED
@@ -203,49 +203,30 @@ pipe_asr = pipeline(
     tokenizer=processor.tokenizer,
     feature_extractor=processor.feature_extractor,
     max_new_tokens=128,
-    chunk_length_s=15,
+    chunk_length_s=5,  # Process audio in 5-second chunks
     batch_size=16,
     torch_dtype=torch_dtype,
     device=device,
-    return_timestamps=True
+    return_timestamps=False
 )
 
-# Set up logging to a file to capture debug information
-logging.basicConfig(filename='neo4j_retrieval.log', level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')
-
-# Function to handle voice input, generate response from Neo4j, and return audio output
-def handle_voice_to_voice(audio):
+# Function to process audio in real-time and update the transcription
+def transcribe_audio_real_time(audio):
     try:
-        # Transcribe audio input to text
         sr, y = audio
         y = y.astype(np.float32)
         y = y / np.max(np.abs(y))  # Normalize audio to range [-1.0, 1.0]
 
-        logging.debug(f"Audio sample rate: {sr}")
-        logging.debug(f"Audio data: {y[:100]}")  # Log first 100 samples for brevity
-
-        # Process the audio data with Whisper ASR
+        # Process the audio data with Whisper ASR in chunks
         result = pipe_asr({"array": y, "sampling_rate": sr}, return_timestamps=False)
-        question = result.get("text", "")
-
-        logging.debug(f"Transcribed question: {question}")
-
-        if not question:
-            return "No transcription available, please try again.", ""
+        transcription = result.get("text", "")
 
-        # Get response using the transcribed question
-        response = get_response(question)
-        logging.debug(f"Response from Neo4j and GPT: {response}")
-
-        # Generate audio from the response
-        audio_path = generate_audio_elevenlabs(response)
-        logging.debug(f"Generated audio path: {audio_path}")
-
-        # Return the transcription and the audio path
-        return audio_path, question
+        logging.debug(f"Real-time transcription: {transcription}")
+        return transcription
     except Exception as e:
-        logging.error(f"Error in handle_voice_to_voice: {e}")
-        return "Error processing the audio, please try again.", ""
+        logging.error(f"Error during real-time transcription: {e}")
+        return "Error processing the audio, please try again."
+
 
 # Function to clear the transcription state
 def clear_state():
@@ -254,21 +235,33 @@ def clear_state():
 # Define the Gradio interface
 with gr.Blocks(theme="rawrsor1/Everforest") as demo:
     audio_input = gr.Audio(sources=["microphone"], type='numpy', streaming=True, label="Speak to Ask")
+    transcription_textbox = gr.Textbox(label="Transcription", interactive=False)
     submit_voice_btn = gr.Button("Submit Voice")
     clear_state_btn = gr.Button("Clear State")
-    transcription_textbox = gr.Textbox(label="Transcription", interactive=False)
     audio_output = gr.Audio(label="Response Audio", type="filepath", autoplay=True, interactive=False)
 
-    # Interactions for Submit Voice Button
-    submit_voice_btn.click(
-        fn=handle_voice_to_voice,
+    # Update the transcription text in real-time as the user speaks
+    audio_input.stream(
+        fn=transcribe_audio_real_time,
         inputs=audio_input,
-        outputs=[audio_output, transcription_textbox]
+        outputs=transcription_textbox
+    )
+
+    # Define a placeholder function for handling submission
+    def handle_submit(text):
+        # Placeholder function, could trigger response generation or other actions
+        return f"You submitted: {text}"
+
+    # Handle the submission of the final transcribed text
+    submit_voice_btn.click(
+        fn=handle_submit,
+        inputs=transcription_textbox,
+        outputs=transcription_textbox
     )
 
     # Interaction for Clear State Button
     clear_state_btn.click(
-        fn=clear_state,
+        fn=lambda: "",
        outputs=transcription_textbox
     )
 
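
For context on how the changed pieces fit together at runtime, below is a minimal, self-contained sketch of the streaming transcription flow this commit wires up. The model id, processor setup, and device/dtype selection are assumptions (they sit above the changed hunk in app.py and are not shown in this diff); only the transcribe_audio_real_time handler and the audio_input.stream() binding mirror the added lines.

# Hypothetical, minimal reconstruction -- model_id and the processor/device setup
# are assumptions; only the handler and the .stream() wiring come from this diff.
import logging

import gradio as gr
import numpy as np
import torch
from transformers import AutoProcessor, pipeline

logging.basicConfig(level=logging.DEBUG)

device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
model_id = "openai/whisper-small"  # assumption: any Whisper checkpoint works here

processor = AutoProcessor.from_pretrained(model_id)
pipe_asr = pipeline(
    "automatic-speech-recognition",
    model=model_id,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    chunk_length_s=5,        # short chunks keep per-update latency low
    batch_size=16,
    torch_dtype=torch_dtype,
    device=device,
    return_timestamps=False,
)

def transcribe_audio_real_time(audio):
    # Gradio streams microphone audio as a (sample_rate, numpy_samples) tuple.
    try:
        sr, y = audio
        y = y.astype(np.float32)
        y = y / np.max(np.abs(y))  # normalize to [-1.0, 1.0]
        result = pipe_asr({"array": y, "sampling_rate": sr}, return_timestamps=False)
        return result.get("text", "")
    except Exception as e:
        logging.error(f"Error during real-time transcription: {e}")
        return "Error processing the audio, please try again."

with gr.Blocks() as demo:
    audio_input = gr.Audio(sources=["microphone"], type="numpy", streaming=True, label="Speak to Ask")
    transcription_textbox = gr.Textbox(label="Transcription", interactive=False)
    # .stream() fires repeatedly while recording, so the textbox refreshes as the user speaks.
    audio_input.stream(
        fn=transcribe_audio_real_time,
        inputs=audio_input,
        outputs=transcription_textbox,
    )

demo.launch()

With chunk_length_s=5 and return_timestamps=False, each streamed microphone update is transcribed quickly enough for the textbox to refresh while the user is still speaking. Note that after this change the Submit Voice button only echoes the final transcription through the placeholder handle_submit; the removed handle_voice_to_voice path (get_response plus generate_audio_elevenlabs) is no longer triggered from the UI.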