Spaces: Running on Zero
Update app.py
Browse files
app.py
CHANGED
@@ -110,8 +110,12 @@ def process_image_input(image, vision_model, processor):
         return "Error: Vision model is not available."
 
     try:
+        # Check if image is already a PIL Image
+        if not isinstance(image, Image.Image):
+            # If it's not, assume it's a file path or bytes and open it
+            image = Image.open(image).convert('RGB')
+
         # Process the image
-        image = Image.open(io.BytesIO(image)).convert('RGB')
         inputs = processor(images=image, return_tensors="pt").to(vision_model.device)
 
         # Generate text
@@ -129,6 +133,7 @@ def process_image_input(image, vision_model, processor):
         return generated_text
     except Exception as e:
         return f"Error processing image: {str(e)}"
+
 # Generate response within a GPU-decorated function
 @spaces.GPU
 def generate_response(transcription, sarvam_pipe):
@@ -192,6 +197,7 @@ def indic_vision_assistant(input_type, audio_input, text_input, image_input):
     elif input_type == "text" and text_input:
         transcription = text_input
     elif input_type == "image" and image_input is not None:
+        # Directly pass the image_input to process_image_input
         transcription = process_image_input(image_input, vision_model, processor)
     else:
         return "Please provide either audio, text, or image input.", "No input provided.", None