Spaces: Running on Zero
Update app.py
Browse files
app.py
CHANGED
@@ -110,8 +110,12 @@ def process_image_input(image, vision_model, processor):
         return "Error: Vision model is not available."
 
     try:
+        # Check if image is already a PIL Image
+        if not isinstance(image, Image.Image):
+            # If it's not, assume it's a file path or bytes and open it
+            image = Image.open(image).convert('RGB')
+
         # Process the image
-        image = Image.open(io.BytesIO(image)).convert('RGB')
         inputs = processor(images=image, return_tensors="pt").to(vision_model.device)
 
         # Generate text
@@ -129,6 +133,7 @@ def process_image_input(image, vision_model, processor):
         return generated_text
     except Exception as e:
         return f"Error processing image: {str(e)}"
+
 # Generate response within a GPU-decorated function
 @spaces.GPU
 def generate_response(transcription, sarvam_pipe):
@@ -192,6 +197,7 @@ def indic_vision_assistant(input_type, audio_input, text_input, image_input):
     elif input_type == "text" and text_input:
         transcription = text_input
     elif input_type == "image" and image_input is not None:
+        # Directly pass the image_input to process_image_input
         transcription = process_image_input(image_input, vision_model, processor)
     else:
         return "Please provide either audio, text, or image input.", "No input provided.", None