Update app.py
app.py
CHANGED
@@ -92,7 +92,7 @@ def stream_text_chat(message, history, system_prompt, temperature=0.8, max_new_t
         yield history + [[message, buffer]], audio_path
 
 @spaces.GPU
-def process_vision_query(image, text_input):
+def process_vision_query(image, text_input, generate_speech=True):
     prompt = f"<|user|>\n<|image_1|>\n{text_input}<|end|>\n<|assistant|>\n"
 
     # Check if image is already a PIL Image
@@ -115,26 +115,35 @@ def process_vision_query(image, text_input):
 
         generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
         response = vision_processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
-
+
+        if generate_speech:
+            audio_path = generate_speech_from_text(response)
+            return response, audio_path
+        else:
+            return response, None
     except RuntimeError as e:
         if "CUDA out of memory" in str(e):
-
+            error_message = "Error: GPU out of memory. Try processing a smaller image or freeing up GPU resources."
+            return error_message, None
         else:
             raise e
 
-
-def
+
+def generate_speech_from_text(text, description="A clear voice reads out the response."):
     input_ids = tts_tokenizer(description, return_tensors="pt").input_ids.to(tts_device)
-    prompt_input_ids = tts_tokenizer(
+    prompt_input_ids = tts_tokenizer(text, return_tensors="pt").input_ids.to(tts_device)
 
-
+    with torch.no_grad():
+        generation = tts_model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
+
     audio_arr = generation.cpu().numpy().squeeze()
 
-    output_path = f"output_audio_{hash(
+    output_path = f"output_audio_{hash(text)}.wav"
    sf.write(output_path, audio_arr, tts_model.config.sampling_rate)
 
     return output_path
 
+
 # Custom CSS
 custom_css = """
 body { background-color: #0b0f19; color: #e2e8f0; font-family: 'Arial', sans-serif;}
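The new generate_speech_from_text helper leans on module-level tts_tokenizer, tts_model, and tts_device objects that are defined earlier in app.py, outside this diff. For reference, a minimal sketch of how such globals are typically initialized for Parler-TTS is below; the checkpoint name is an illustrative assumption, not something this diff confirms:

import torch
from parler_tts import ParlerTTSForConditionalGeneration
from transformers import AutoTokenizer

# Assumed setup for the module-level objects used by generate_speech_from_text.
# The checkpoint actually loaded in app.py may differ;
# "parler-tts/parler_tts_mini_v0.1" is an illustrative choice.
tts_device = "cuda" if torch.cuda.is_available() else "cpu"
tts_model = ParlerTTSForConditionalGeneration.from_pretrained(
    "parler-tts/parler_tts_mini_v0.1"
).to(tts_device)
tts_tokenizer = AutoTokenizer.from_pretrained("parler-tts/parler_tts_mini_v0.1")

With objects like these in place, the function's two tokenizer calls map onto Parler-TTS's two-prompt generate API: the voice description becomes input_ids and the text to be spoken becomes prompt_input_ids.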
@@ -214,16 +223,20 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Base().set(
         outputs=[chatbot, audio_output])
     clear_btn.click(lambda: (None, None), None, [chatbot, audio_output], queue=False)
 
-    with gr.Tab("Vision Model (Phi-3.5-vision)"):
-
-
-
-
-
-
-
-
-
+    with gr.Tab("Vision Model with TTS (Phi-3.5-vision)"):
+        with gr.Row():
+            with gr.Column(scale=1):
+                vision_input_img = gr.Image(label="Upload an Image", type="pil")
+                vision_text_input = gr.Textbox(label="Ask a question about the image", placeholder="What do you see in this image?")
+                vision_submit_btn = gr.Button("Analyze Image and Generate Speech", variant="primary")
+            with gr.Column(scale=1):
+                vision_output_text = gr.Textbox(label="AI Analysis", lines=10)
+                vision_output_audio = gr.Audio(label="Generated Speech")
+
+        vision_submit_btn.click(process_vision_query,
+                                inputs=[vision_input_img, vision_text_input],
+                                outputs=[vision_output_text, vision_output_audio])
+
 
     with gr.Tab("Text-to-Speech (Parler-TTS)"):
         with gr.Row():
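After this change process_vision_query always returns a (text, audio) pair: (response, path) by default, (response, None) when generate_speech=False, or (error_message, None) on a CUDA OOM. That is why the new tab binds both vision_output_text and vision_output_audio as outputs. A hypothetical standalone check of that contract (example.jpg is a placeholder path):

from PIL import Image

image = Image.open("example.jpg")  # placeholder input image

# Default path: analysis text plus the path of a generated .wav file.
response, audio_path = process_vision_query(image, "What do you see in this image?")

# Text-only path: skip TTS entirely; the audio slot is filled with None.
response_only, no_audio = process_vision_query(image, "Describe the scene.", generate_speech=False)
assert no_audio is None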
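One caveat on the new filename scheme: Python salts str hashes per process, so hash(text) in output_audio_{hash(text)}.wav yields a different (possibly negative) number on every restart of the Space. If stable, collision-resistant names ever matter, a hashlib-based variant is a small change; this is a sketch, not part of the commit:

import hashlib

# Deterministic across processes, always non-negative hex:
output_path = f"output_audio_{hashlib.md5(text.encode()).hexdigest()[:12]}.wav"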