Spaces:

sagar007
/

Multimodal_App

Build error

sagar007 committed on Aug 25, 2024

Commit

d880060

verified ·

1 Parent(s): ae87ff0

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -85,7 +85,11 @@ def stream_text_chat(message, history, system_prompt, temperature=0.8, max_new_t
     buffer = ""
     for new_text in streamer:
         buffer += new_text
-        yield history + [[message, buffer]]
 @spaces.GPU
 def process_vision_query(image, text_input):
@@ -112,7 +116,7 @@ def generate_speech(prompt, description):
     generation = tts_model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
     audio_arr = generation.cpu().numpy().squeeze()
-    output_path = "output_audio.wav"
     sf.write(output_path, audio_arr, tts_model.config.sampling_rate)
     return output_path
@@ -189,9 +193,12 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Base().set(
         submit_btn = gr.Button("Submit", variant="primary")
         clear_btn = gr.Button("Clear Chat", variant="secondary")
-        submit_btn.click(stream_text_chat, [msg, chatbot, system_prompt, temperature, max_new_tokens, top_p, top_k], [chatbot])
-        clear_btn.click(lambda: None, None, chatbot, queue=False)
     with gr.Tab("Vision Model (Phi-3.5-vision)"):
         with gr.Row():

     buffer = ""
     for new_text in streamer:
         buffer += new_text
+        yield history + [[message, buffer]], None  # Yield None for audio initially
+    # Generate speech for the final response
+    audio_path = generate_speech(buffer, "A clear and concise voice reads out the response.")
+    yield history + [[message, buffer]], audio_path
 @spaces.GPU
 def process_vision_query(image, text_input):
     generation = tts_model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
     audio_arr = generation.cpu().numpy().squeeze()
+    output_path = f"output_audio_{hash(prompt)}.wav"
     sf.write(output_path, audio_arr, tts_model.config.sampling_rate)
     return output_path
         submit_btn = gr.Button("Submit", variant="primary")
         clear_btn = gr.Button("Clear Chat", variant="secondary")
+        audio_output = gr.Audio(label="AI Response Audio")
+        submit_btn.click(stream_text_chat,
+                         inputs=[msg, chatbot, system_prompt, temperature, max_new_tokens, top_p, top_k],
+                         outputs=[chatbot, audio_output])
+        clear_btn.click(lambda: (None, None), None, [chatbot, audio_output], queue=False)
     with gr.Tab("Vision Model (Phi-3.5-vision)"):
         with gr.Row():