Spaces:
Running
on
Zero
Running
on
Zero
remove audio debug zone
Browse files
app.py
CHANGED
@@ -135,34 +135,15 @@ def transcribe_audio(audio_input):
|
|
135 |
# ------------------- Main Processing Function -------------------
|
136 |
@spaces.GPU # Decorate to run on GPU when processing
|
137 |
def classify_intent(mode, mic_audio, text_input, file_audio, model_choice):
|
138 |
-
# Determine input
|
139 |
if mode == "Microphone" and mic_audio is not None:
|
140 |
-
|
141 |
-
transcription = transcribe_audio(chosen_audio)
|
142 |
elif mode == "Text" and text_input:
|
143 |
transcription = text_input
|
144 |
-
chosen_audio = None
|
145 |
elif mode == "File" and file_audio is not None:
|
146 |
-
|
147 |
-
transcription = transcribe_audio(chosen_audio)
|
148 |
else:
|
149 |
-
return "請提供語音或文字輸入", "", None
|
150 |
-
|
151 |
-
# For debugging: prepare debug audio.
|
152 |
-
debug_audio = None
|
153 |
-
if chosen_audio is not None:
|
154 |
-
if isinstance(chosen_audio, str):
|
155 |
-
# For file input, read using soundfile to get raw audio.
|
156 |
-
audio_array, sample_rate = sf.read(chosen_audio)
|
157 |
-
if audio_array.ndim > 1:
|
158 |
-
audio_array = np.mean(audio_array, axis=-1)
|
159 |
-
debug_audio = (sample_rate, audio_array)
|
160 |
-
elif isinstance(chosen_audio, tuple):
|
161 |
-
audio_array = chosen_audio[1]
|
162 |
-
sample_rate = chosen_audio[0]
|
163 |
-
if audio_array.ndim > 1:
|
164 |
-
audio_array = np.mean(audio_array, axis=-1)
|
165 |
-
debug_audio = (sample_rate, audio_array)
|
166 |
|
167 |
# Classify the transcribed or provided text.
|
168 |
if available_models[model_choice] == "qwen":
|
@@ -172,7 +153,7 @@ def classify_intent(mode, mic_audio, text_input, file_audio, model_choice):
|
|
172 |
# Generate TTS message and corresponding audio.
|
173 |
tts_msg = get_tts_message(classification)
|
174 |
tts_audio = tts_audio_output(tts_msg)
|
175 |
-
return transcription, classification, tts_audio
|
176 |
|
177 |
# ------------------- Gradio Blocks Interface Setup -------------------
|
178 |
with gr.Blocks() as demo:
|
@@ -187,7 +168,7 @@ with gr.Blocks() as demo:
|
|
187 |
# Three input components: microphone, text, and file upload.
|
188 |
mic_audio = gr.Audio(sources=["microphone"], type="numpy", label="語音輸入 (點擊錄音)")
|
189 |
text_input = gr.Textbox(lines=2, placeholder="請輸入文字", label="文字輸入")
|
190 |
-
# For file input, use 'filepath' so Whisper pipeline
|
191 |
file_audio = gr.Audio(sources=["upload"], type="filepath", label="上傳語音檔案")
|
192 |
|
193 |
# Initially, only the microphone input is visible.
|
@@ -201,7 +182,7 @@ with gr.Blocks() as demo:
|
|
201 |
elif selected_mode == "Text":
|
202 |
return gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)
|
203 |
else: # File
|
204 |
-
return gr.update(visible=False), gr.update(visible=False), gr.update(visible=True)
|
205 |
mode.change(fn=update_visibility, inputs=mode, outputs=[mic_audio, text_input, file_audio])
|
206 |
|
207 |
with gr.Row():
|
@@ -217,12 +198,10 @@ with gr.Blocks() as demo:
|
|
217 |
classification_output = gr.Textbox(label="意圖判斷結果")
|
218 |
with gr.Row():
|
219 |
tts_output = gr.Audio(type="numpy", label="TTS 語音輸出")
|
220 |
-
with gr.Row():
|
221 |
-
debug_audio_output = gr.Audio(type="numpy", label="Debug: 傳送到 Whisper Pipeline 的音訊")
|
222 |
|
223 |
# Button event triggers the classification.
|
224 |
classify_btn.click(fn=classify_intent,
|
225 |
inputs=[mode, mic_audio, text_input, file_audio, model_dropdown],
|
226 |
-
outputs=[transcription_output, classification_output, tts_output, debug_audio_output])
|
227 |
|
228 |
demo.launch()
|
|
|
135 |
# ------------------- Main Processing Function -------------------
|
136 |
@spaces.GPU # Decorate to run on GPU when processing
|
137 |
def classify_intent(mode, mic_audio, text_input, file_audio, model_choice):
|
138 |
+
# Determine input based on selected mode.
|
139 |
if mode == "Microphone" and mic_audio is not None:
|
140 |
+
transcription = transcribe_audio(mic_audio)
|
|
|
141 |
elif mode == "Text" and text_input:
|
142 |
transcription = text_input
|
|
|
143 |
elif mode == "File" and file_audio is not None:
|
144 |
+
transcription = transcribe_audio(file_audio)
|
|
|
145 |
else:
|
146 |
+
return "請提供語音或文字輸入", "", None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
147 |
|
148 |
# Classify the transcribed or provided text.
|
149 |
if available_models[model_choice] == "qwen":
|
|
|
153 |
# Generate TTS message and corresponding audio.
|
154 |
tts_msg = get_tts_message(classification)
|
155 |
tts_audio = tts_audio_output(tts_msg)
|
156 |
+
return transcription, classification, tts_audio
|
157 |
|
158 |
# ------------------- Gradio Blocks Interface Setup -------------------
|
159 |
with gr.Blocks() as demo:
|
|
|
168 |
# Three input components: microphone, text, and file upload.
|
169 |
mic_audio = gr.Audio(sources=["microphone"], type="numpy", label="語音輸入 (點擊錄音)")
|
170 |
text_input = gr.Textbox(lines=2, placeholder="請輸入文字", label="文字輸入")
|
171 |
+
# For file input, use 'filepath' so Whisper pipeline handles conversion.
|
172 |
file_audio = gr.Audio(sources=["upload"], type="filepath", label="上傳語音檔案")
|
173 |
|
174 |
# Initially, only the microphone input is visible.
|
|
|
182 |
elif selected_mode == "Text":
|
183 |
return gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)
|
184 |
else: # File
|
185 |
+
return gr.update(visible=False), gr.update(visible=False), gr.update(visible=True)
|
186 |
mode.change(fn=update_visibility, inputs=mode, outputs=[mic_audio, text_input, file_audio])
|
187 |
|
188 |
with gr.Row():
|
|
|
198 |
classification_output = gr.Textbox(label="意圖判斷結果")
|
199 |
with gr.Row():
|
200 |
tts_output = gr.Audio(type="numpy", label="TTS 語音輸出")
|
|
|
|
|
201 |
|
202 |
# Button event triggers the classification.
|
203 |
classify_btn.click(fn=classify_intent,
|
204 |
inputs=[mode, mic_audio, text_input, file_audio, model_dropdown],
|
205 |
+
outputs=[transcription_output, classification_output, tts_output])
|
206 |
|
207 |
demo.launch()
|