Spaces:

sagar007
/

Multimodal_App

Build error

sagar007 committed on Aug 25, 2024

Commit

6f27c43

verified ·

1 Parent(s): 231fb5f

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -80,11 +80,26 @@ def stream_text_chat(message, history, system_prompt, temperature=0.8, max_new_t
         buffer += new_text
         yield history + [[message, buffer]]
-@spaces.GPU  # Add this decorator
 def process_vision_query(image, text_input):
     prompt = f"<|user|>\n<|image_1|>\n{text_input}<|end|>\n<|assistant|>\n"
-    image = Image.fromarray(image).convert("RGB")
-    inputs = vision_processor(prompt, image, return_tensors="pt").to(device)
     with torch.no_grad():
         generate_ids = vision_model.generate(

         buffer += new_text
         yield history + [[message, buffer]]
+@spaces.GPU
 def process_vision_query(image, text_input):
     prompt = f"<|user|>\n<|image_1|>\n{text_input}<|end|>\n<|assistant|>\n"
+    # Convert the image to bytes if it's not already
+    if isinstance(image, Image.Image):
+        # If it's a PIL Image, convert to bytes
+        import io
+        img_byte_arr = io.BytesIO()
+        image.save(img_byte_arr, format='PNG')
+        image = img_byte_arr.getvalue()
+    elif isinstance(image, np.ndarray):
+        # If it's a numpy array, convert to PIL Image first, then to bytes
+        image = Image.fromarray(image).convert("RGB")
+        img_byte_arr = io.BytesIO()
+        image.save(img_byte_arr, format='PNG')
+        image = img_byte_arr.getvalue()
+    # Now process the image bytes
+    inputs = vision_processor(prompt, images=image, return_tensors="pt").to(device)
     with torch.no_grad():
         generate_ids = vision_model.generate(