Spaces:
Running
on
Zero
Running
on
Zero
Update app.py
Browse files
app.py
CHANGED
@@ -94,19 +94,33 @@ def stream_text_chat(message, history, system_prompt, temperature=0.8, max_new_t
|
|
94 |
@spaces.GPU
|
95 |
def process_vision_query(image, text_input):
|
96 |
prompt = f"<|user|>\n<|image_1|>\n{text_input}<|end|>\n<|assistant|>\n"
|
97 |
-
image = Image.fromarray(image).convert("RGB")
|
98 |
-
inputs = vision_processor(prompt, image, return_tensors="pt").to(device)
|
99 |
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
|
|
|
|
|
|
|
106 |
|
107 |
-
|
108 |
-
|
109 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
110 |
|
111 |
@spaces.GPU
|
112 |
def generate_speech(prompt, description):
|
|
|
94 |
@spaces.GPU
def process_vision_query(image, text_input):
    """Answer a text question about an image with the vision model.

    Args:
        image: The picture to analyse, either a PIL ``Image.Image`` or a
            numpy array accepted by ``Image.fromarray``.
        text_input: The user's question; it is inserted into the chat prompt.

    Returns:
        The decoded model response as a string, or a human-readable error
        message if generation failed with a CUDA out-of-memory error.

    Raises:
        ValueError: If ``image`` is neither a PIL image nor a numpy array.
        RuntimeError: Any runtime error other than CUDA out-of-memory is
            re-raised unchanged.
    """
    prompt = f"<|user|>\n<|image_1|>\n{text_input}<|end|>\n<|assistant|>\n"

    # Normalise both accepted input types to an RGB PIL image so the
    # processor always sees the same mode, whatever the caller supplied
    # (e.g. RGBA, palette or greyscale PIL images).
    if isinstance(image, Image.Image):
        pil_image = image.convert("RGB")
    elif isinstance(image, np.ndarray):
        pil_image = Image.fromarray(image).convert("RGB")
    else:
        raise ValueError("Unsupported image type. Expected PIL Image or numpy array.")

    inputs = vision_processor(prompt, pil_image, return_tensors="pt").to(device)

    try:
        # Inference only: no gradients are needed.
        with torch.no_grad():
            generate_ids = vision_model.generate(
                **inputs,
                max_new_tokens=1000,
                eos_token_id=vision_processor.tokenizer.eos_token_id,
            )

        # Drop the leading positions that correspond to the prompt's
        # input_ids so that only the newly generated tokens are decoded.
        prompt_length = inputs['input_ids'].shape[1]
        generate_ids = generate_ids[:, prompt_length:]
        response = vision_processor.batch_decode(
            generate_ids,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )[0]
        return response
    except RuntimeError as e:
        # Report GPU memory exhaustion as a message instead of a crash;
        # every other runtime error propagates with its original traceback.
        if "CUDA out of memory" in str(e):
            return "Error: GPU out of memory. Try processing a smaller image or freeing up GPU resources."
        raise
|
124 |
|
125 |
@spaces.GPU
|
126 |
def generate_speech(prompt, description):
|