Spaces:

Luigi
/

dinercall-intent-demo

Running on Zero

App Files Files Community

Luigi committed on Apr 12

Commit

b824b83

1 Parent(s): b36759a

remove audio debug zone

Browse files

Files changed (1) hide show

app.py +8 -29

app.py CHANGED Viewed

@@ -135,34 +135,15 @@ def transcribe_audio(audio_input):
 # ------------------- Main Processing Function -------------------
 @spaces.GPU  # Decorate to run on GPU when processing
 def classify_intent(mode, mic_audio, text_input, file_audio, model_choice):
-    # Determine input and choose appropriately.
     if mode == "Microphone" and mic_audio is not None:
-        chosen_audio = mic_audio
-        transcription = transcribe_audio(chosen_audio)
     elif mode == "Text" and text_input:
         transcription = text_input
-        chosen_audio = None
     elif mode == "File" and file_audio is not None:
-        chosen_audio = file_audio
-        transcription = transcribe_audio(chosen_audio)
     else:
-        return "請提供語音或文字輸入", "", None, None
-    # For debugging: prepare debug audio.
-    debug_audio = None
-    if chosen_audio is not None:
-        if isinstance(chosen_audio, str):
-            # For file input, read using soundfile to get raw audio.
-            audio_array, sample_rate = sf.read(chosen_audio)
-            if audio_array.ndim > 1:
-                audio_array = np.mean(audio_array, axis=-1)
-            debug_audio = (sample_rate, audio_array)
-        elif isinstance(chosen_audio, tuple):
-            audio_array = chosen_audio[1]
-            sample_rate = chosen_audio[0]
-            if audio_array.ndim > 1:
-                audio_array = np.mean(audio_array, axis=-1)
-            debug_audio = (sample_rate, audio_array)
     # Classify the transcribed or provided text.
     if available_models[model_choice] == "qwen":
@@ -172,7 +153,7 @@ def classify_intent(mode, mic_audio, text_input, file_audio, model_choice):
     # Generate TTS message and corresponding audio.
     tts_msg = get_tts_message(classification)
     tts_audio = tts_audio_output(tts_msg)
-    return transcription, classification, tts_audio, debug_audio
 # ------------------- Gradio Blocks Interface Setup -------------------
 with gr.Blocks() as demo:
@@ -187,7 +168,7 @@ with gr.Blocks() as demo:
         # Three input components: microphone, text, and file upload.
         mic_audio = gr.Audio(sources=["microphone"], type="numpy", label="語音輸入 (點擊錄音)")
         text_input = gr.Textbox(lines=2, placeholder="請輸入文字", label="文字輸入")
-        # For file input, use 'filepath' so Whisper pipeline gets the file and can convert it internally.
         file_audio = gr.Audio(sources=["upload"], type="filepath", label="上傳語音檔案")
     # Initially, only the microphone input is visible.
@@ -201,7 +182,7 @@ with gr.Blocks() as demo:
         elif selected_mode == "Text":
             return gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)
         else:  # File
-            return gr.update(visible=False), gr.update(visible=False), gr.update(visible=True)
     mode.change(fn=update_visibility, inputs=mode, outputs=[mic_audio, text_input, file_audio])
     with gr.Row():
@@ -217,12 +198,10 @@ with gr.Blocks() as demo:
         classification_output = gr.Textbox(label="意圖判斷結果")
     with gr.Row():
         tts_output = gr.Audio(type="numpy", label="TTS 語音輸出")
-    with gr.Row():
-        debug_audio_output = gr.Audio(type="numpy", label="Debug: 傳送到 Whisper Pipeline 的音訊")
     # Button event triggers the classification.
     classify_btn.click(fn=classify_intent,
                        inputs=[mode, mic_audio, text_input, file_audio, model_dropdown],
-                       outputs=[transcription_output, classification_output, tts_output, debug_audio_output])
 demo.launch()

 # ------------------- Main Processing Function -------------------
 @spaces.GPU  # Decorate to run on GPU when processing
 def classify_intent(mode, mic_audio, text_input, file_audio, model_choice):
+    # Determine input based on selected mode.
     if mode == "Microphone" and mic_audio is not None:
+        transcription = transcribe_audio(mic_audio)
     elif mode == "Text" and text_input:
         transcription = text_input
     elif mode == "File" and file_audio is not None:
+        transcription = transcribe_audio(file_audio)
     else:
+        return "請提供語音或文字輸入", "", None
     # Classify the transcribed or provided text.
     if available_models[model_choice] == "qwen":
     # Generate TTS message and corresponding audio.
     tts_msg = get_tts_message(classification)
     tts_audio = tts_audio_output(tts_msg)
+    return transcription, classification, tts_audio
 # ------------------- Gradio Blocks Interface Setup -------------------
 with gr.Blocks() as demo:
         # Three input components: microphone, text, and file upload.
         mic_audio = gr.Audio(sources=["microphone"], type="numpy", label="語音輸入 (點擊錄音)")
         text_input = gr.Textbox(lines=2, placeholder="請輸入文字", label="文字輸入")
+        # For file input, use 'filepath' so Whisper pipeline handles conversion.
         file_audio = gr.Audio(sources=["upload"], type="filepath", label="上傳語音檔案")
     # Initially, only the microphone input is visible.
         elif selected_mode == "Text":
             return gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)
         else:  # File
+            return gr.update(visible=False), gr.update(visible=False), gr.update(visible=True)
     mode.change(fn=update_visibility, inputs=mode, outputs=[mic_audio, text_input, file_audio])
     with gr.Row():
         classification_output = gr.Textbox(label="意圖判斷結果")
     with gr.Row():
         tts_output = gr.Audio(type="numpy", label="TTS 語音輸出")
     # Button event triggers the classification.
     classify_btn.click(fn=classify_intent,
                        inputs=[mode, mic_audio, text_input, file_audio, model_dropdown],
+                       outputs=[transcription_output, classification_output, tts_output])
 demo.launch()