Pijush2023 committed on
Commit
739e317
·
verified ·
1 Parent(s): 49d592e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +32 -22
app.py CHANGED
@@ -210,31 +210,41 @@ pipe_asr = pipeline(
210
  return_timestamps=False
211
  )
212
 
213
- # Function to process audio in real-time and update the transcription
214
- def transcribe_audio_real_time(audio):
215
  try:
216
- sr, y = audio
217
- y = y.astype(np.float32)
218
- y = y / np.max(np.abs(y)) # Normalize audio to range [-1.0, 1.0]
219
-
220
- # Process the audio data with Whisper ASR in chunks
221
- result = pipe_asr({"array": y, "sampling_rate": sr}, return_timestamps=False)
222
- transcription = result.get("text", "")
 
 
 
 
 
 
 
 
 
 
 
 
223
 
224
- logging.debug(f"Real-time transcription: {transcription}")
225
- return transcription
226
- except Exception as e:
227
- logging.error(f"Error during real-time transcription: {e}")
228
- return "Error processing the audio, please try again."
229
 
 
 
230
 
231
- # Function to clear the transcription state
232
- def clear_state():
233
- return ""
234
 
235
  # Define the Gradio interface
236
  with gr.Blocks(theme="rawrsor1/Everforest") as demo:
237
- audio_input = gr.Audio(sources=["microphone"], type='numpy', streaming=False, label="Speak to Ask")
238
  transcription_textbox = gr.Textbox(label="Transcription", interactive=False)
239
  submit_voice_btn = gr.Button("Submit Voice")
240
  clear_state_btn = gr.Button("Clear State")
@@ -242,12 +252,12 @@ with gr.Blocks(theme="rawrsor1/Everforest") as demo:
242
 
243
  # Update the transcription text in real-time as the user speaks
244
  audio_input.stream(
245
- fn=transcribe_audio_real_time,
246
- inputs=audio_input,
247
- outputs=transcription_textbox
248
  )
249
 
250
- # Define a placeholder function for handling submission
251
  def handle_submit(text):
252
  # Placeholder function, could trigger response generation or other actions
253
  return f"You submitted: {text}"
 
210
  return_timestamps=False
211
  )
212
 
213
+ # Function to handle audio transcription in real-time
214
+ def transcribe_function(stream, new_chunk):
215
  try:
216
+ sr, y = new_chunk[0], new_chunk[1]
217
+ except TypeError:
218
+ print(f"Error chunk structure: {type(new_chunk)}, content: {new_chunk}")
219
+ return stream, "", None
220
+
221
+ # Ensure y is not empty and is at least 1-dimensional
222
+ if y is None or len(y) == 0:
223
+ return stream, "", None
224
+
225
+ y = y.astype(np.float32)
226
+ max_abs_y = np.max(np.abs(y))
227
+ if max_abs_y > 0:
228
+ y = y / max_abs_y
229
+
230
+ # Ensure stream is also at least 1-dimensional before concatenation
231
+ if stream is not None and len(stream) > 0:
232
+ stream = np.concatenate([stream, y])
233
+ else:
234
+ stream = y
235
 
236
+ # Process the audio data for transcription
237
+ result = pipe_asr({"array": stream, "sampling_rate": sr}, return_timestamps=False)
238
+ full_text = result.get("text", "")
 
 
239
 
240
+ # Start a thread to reset the state after 10 seconds
241
+ threading.Thread(target=auto_reset_state).start()
242
 
243
+ return stream, full_text, full_text
 
 
244
 
245
  # Define the Gradio interface
246
  with gr.Blocks(theme="rawrsor1/Everforest") as demo:
247
+ audio_input = gr.Audio(sources=["microphone"], type='numpy', streaming=True, label="Speak to Ask")
248
  transcription_textbox = gr.Textbox(label="Transcription", interactive=False)
249
  submit_voice_btn = gr.Button("Submit Voice")
250
  clear_state_btn = gr.Button("Clear State")
 
252
 
253
  # Update the transcription text in real-time as the user speaks
254
  audio_input.stream(
255
+ fn=transcribe_function,
256
+ inputs=[None, audio_input],
257
+ outputs=[None, transcription_textbox, transcription_textbox]
258
  )
259
 
260
+ # Placeholder function for handling submission
261
  def handle_submit(text):
262
  # Placeholder function, could trigger response generation or other actions
263
  return f"You submitted: {text}"