Luigi committed on
Commit b824b83 · 1 Parent(s): b36759a

remove audio debug zone

Files changed (1)
  1. app.py +8 -29
app.py CHANGED
@@ -135,34 +135,15 @@ def transcribe_audio(audio_input):
 # ------------------- Main Processing Function -------------------
 @spaces.GPU  # Decorate to run on GPU when processing
 def classify_intent(mode, mic_audio, text_input, file_audio, model_choice):
-    # Determine input and choose appropriately.
+    # Determine input based on selected mode.
     if mode == "Microphone" and mic_audio is not None:
-        chosen_audio = mic_audio
-        transcription = transcribe_audio(chosen_audio)
+        transcription = transcribe_audio(mic_audio)
     elif mode == "Text" and text_input:
         transcription = text_input
-        chosen_audio = None
     elif mode == "File" and file_audio is not None:
-        chosen_audio = file_audio
-        transcription = transcribe_audio(chosen_audio)
+        transcription = transcribe_audio(file_audio)
     else:
-        return "請提供語音或文字輸入", "", None, None
-
-    # For debugging: prepare debug audio.
-    debug_audio = None
-    if chosen_audio is not None:
-        if isinstance(chosen_audio, str):
-            # For file input, read using soundfile to get raw audio.
-            audio_array, sample_rate = sf.read(chosen_audio)
-            if audio_array.ndim > 1:
-                audio_array = np.mean(audio_array, axis=-1)
-            debug_audio = (sample_rate, audio_array)
-        elif isinstance(chosen_audio, tuple):
-            audio_array = chosen_audio[1]
-            sample_rate = chosen_audio[0]
-            if audio_array.ndim > 1:
-                audio_array = np.mean(audio_array, axis=-1)
-            debug_audio = (sample_rate, audio_array)
+        return "請提供語音或文字輸入", "", None
 
     # Classify the transcribed or provided text.
     if available_models[model_choice] == "qwen":
@@ -172,7 +153,7 @@ def classify_intent(mode, mic_audio, text_input, file_audio, model_choice):
     # Generate TTS message and corresponding audio.
     tts_msg = get_tts_message(classification)
     tts_audio = tts_audio_output(tts_msg)
-    return transcription, classification, tts_audio, debug_audio
+    return transcription, classification, tts_audio
 
 # ------------------- Gradio Blocks Interface Setup -------------------
 with gr.Blocks() as demo:
@@ -187,7 +168,7 @@ with gr.Blocks() as demo:
     # Three input components: microphone, text, and file upload.
     mic_audio = gr.Audio(sources=["microphone"], type="numpy", label="語音輸入 (點擊錄音)")
     text_input = gr.Textbox(lines=2, placeholder="請輸入文字", label="文字輸入")
-    # For file input, use 'filepath' so Whisper pipeline gets the file and can convert it internally.
+    # For file input, use 'filepath' so Whisper pipeline handles conversion.
    file_audio = gr.Audio(sources=["upload"], type="filepath", label="上傳語音檔案")
 
     # Initially, only the microphone input is visible.
@@ -201,7 +182,7 @@ with gr.Blocks() as demo:
         elif selected_mode == "Text":
             return gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)
         else:  # File
-            return gr.update(visible=False), gr.update(visible=False), gr.update(visible=True)
+            return gr.update(visible=False), gr.update(visible=False), gr.update(visible=True)
     mode.change(fn=update_visibility, inputs=mode, outputs=[mic_audio, text_input, file_audio])
 
     with gr.Row():
@@ -217,12 +198,10 @@ with gr.Blocks() as demo:
         classification_output = gr.Textbox(label="意圖判斷結果")
     with gr.Row():
         tts_output = gr.Audio(type="numpy", label="TTS 語音輸出")
-    with gr.Row():
-        debug_audio_output = gr.Audio(type="numpy", label="Debug: 傳送到 Whisper Pipeline 的音訊")
 
     # Button event triggers the classification.
     classify_btn.click(fn=classify_intent,
                        inputs=[mode, mic_audio, text_input, file_audio, model_dropdown],
-                       outputs=[transcription_output, classification_output, tts_output, debug_audio_output])
+                       outputs=[transcription_output, classification_output, tts_output])
 
 demo.launch()
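
Note on the removal: the dropped debug block only re-did, for a debug playback widget, the same kind of input handling that transcribe_audio presumably performs internally. Microphone input arrives from gr.Audio(type="numpy") as a (sample_rate, numpy_array) tuple, while uploads arrive as a filepath string because of type="filepath". Below is a minimal sketch of a helper that accepts both shapes; it assumes a transformers automatic-speech-recognition pipeline held in a variable named asr_pipe and the openai/whisper-small checkpoint. The actual transcribe_audio body is not part of this diff, so these names and details are illustrative only.

import numpy as np
import soundfile as sf
from transformers import pipeline

# Assumed ASR pipeline; the model actually loaded in app.py is not visible in this diff.
asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-small")

def transcribe_audio(audio_input):
    """Transcribe either a filepath (File mode) or a (sample_rate, array) tuple (Microphone mode)."""
    if isinstance(audio_input, str):
        # File upload: read raw audio from disk (soundfile returns float data in [-1, 1]).
        audio_array, sample_rate = sf.read(audio_input)
    else:
        # Microphone: Gradio's type="numpy" delivers (sample_rate, int16 array).
        sample_rate, audio_array = audio_input
    if audio_array.ndim > 1:
        # Downmix multi-channel audio to mono, as the removed debug code also did.
        audio_array = np.mean(audio_array, axis=-1)
    audio_array = audio_array.astype(np.float32)
    if np.abs(audio_array).max() > 1.0:
        # Scale int16 microphone samples into the [-1, 1] range Whisper expects.
        audio_array = audio_array / 32768.0
    result = asr_pipe({"sampling_rate": sample_rate, "raw": audio_array})
    return result["text"]

If transcribe_audio is written along these lines, classify_intent can pass mic_audio or file_audio through unchanged, which is what makes the separate debug conversion branch in the old code redundant.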