Spaces: Running on Zero
Update app.py
Browse files
app.py
CHANGED
@@ -85,7 +85,11 @@ def stream_text_chat(message, history, system_prompt, temperature=0.8, max_new_t
|
|
85 |
buffer = ""
|
86 |
for new_text in streamer:
|
87 |
buffer += new_text
|
88 |
-
yield history + [[message, buffer]]
|
|
|
|
|
|
|
|
|
89 |
|
90 |
@spaces.GPU
|
91 |
def process_vision_query(image, text_input):
|
@@ -112,7 +116,7 @@ def generate_speech(prompt, description):
|
|
112 |
generation = tts_model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
|
113 |
audio_arr = generation.cpu().numpy().squeeze()
|
114 |
|
115 |
-
output_path = "
|
116 |
sf.write(output_path, audio_arr, tts_model.config.sampling_rate)
|
117 |
|
118 |
return output_path
|
@@ -189,9 +193,12 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Base().set(
|
|
189 |
|
190 |
submit_btn = gr.Button("Submit", variant="primary")
|
191 |
clear_btn = gr.Button("Clear Chat", variant="secondary")
|
|
|
192 |
|
193 |
-
submit_btn.click(stream_text_chat,
|
194 |
-
|
|
|
|
|
195 |
|
196 |
with gr.Tab("Vision Model (Phi-3.5-vision)"):
|
197 |
with gr.Row():
|
|
|
85 |
buffer = ""
|
86 |
for new_text in streamer:
|
87 |
buffer += new_text
|
88 |
+
yield history + [[message, buffer]], None # Yield None for audio initially
|
89 |
+
|
90 |
+
# Generate speech for the final response
|
91 |
+
audio_path = generate_speech(buffer, "A clear and concise voice reads out the response.")
|
92 |
+
yield history + [[message, buffer]], audio_path
|
93 |
|
94 |
@spaces.GPU
|
95 |
def process_vision_query(image, text_input):
|
|
|
116 |
generation = tts_model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
|
117 |
audio_arr = generation.cpu().numpy().squeeze()
|
118 |
|
119 |
+
output_path = f"output_audio_{hash(prompt)}.wav"
|
120 |
sf.write(output_path, audio_arr, tts_model.config.sampling_rate)
|
121 |
|
122 |
return output_path
|
|
|
193 |
|
194 |
submit_btn = gr.Button("Submit", variant="primary")
|
195 |
clear_btn = gr.Button("Clear Chat", variant="secondary")
|
196 |
+
audio_output = gr.Audio(label="AI Response Audio")
|
197 |
|
198 |
+
submit_btn.click(stream_text_chat,
|
199 |
+
inputs=[msg, chatbot, system_prompt, temperature, max_new_tokens, top_p, top_k],
|
200 |
+
outputs=[chatbot, audio_output])
|
201 |
+
clear_btn.click(lambda: (None, None), None, [chatbot, audio_output], queue=False)
|
202 |
|
203 |
with gr.Tab("Vision Model (Phi-3.5-vision)"):
|
204 |
with gr.Row():
|