update cap
app.py
CHANGED
@@ -2,8 +2,8 @@ import os
 import sys
 from pathlib import Path
 # os.system("cd transformers && pip install .")
-os.system("cd multimodal && pip install -e .")
-os.system("cd multimodal/YOLOX && pip install .")
+# os.system("cd multimodal && pip install -e .")
+# os.system("cd multimodal/YOLOX && pip install .")
 import numpy as np
 import torch
 from PIL import Image
multimodal/open_flamingo/chat/conversation.py
CHANGED
@@ -324,7 +324,7 @@ class Chat:
                 repetition_penalty=1.0, length_penalty=1, temperature=1, max_length=2000):
         # conv.append_message(conv.roles[1], None)
         # embs = self.get_context_emb(conv, img_list)
-        #
+        #
         # # current_max_len = embs.shape[1] + max_new_tokens + 100
         # # begin_idx = max(0, current_max_len - max_length)
         # # embs = embs[:, begin_idx:]
@@ -494,7 +494,7 @@ class Chat:
         # if len(image.shape) == 3:
         #     image = image.unsqueeze(0)
         # # image = image.to(self.device)
-        #
+        #
         # # image_emb, _ = self.model.encode_img(image)
         # img_list.append(image_emb)
         # conv.append_message(conv.roles[0], "<Img><ImageHere></Img>")
@@ -587,4 +587,3 @@ def evaluate_exp(
 
 
 
-
multimodal/open_flamingo/eval/task/caption_chat.py
CHANGED
@@ -51,7 +51,8 @@ def prepare_batch_images(batch, image_processor):
 
 
 def captioner(
-        model, tokenizer, image_ori, batch_images, input_ids, attention_mask, image_start_index_list, image_nums, added_bbox_list, debug=True):
+        model, tokenizer, image_ori, batch_images, input_ids, attention_mask, image_start_index_list, image_nums,
+        added_bbox_list, debug=True):
     """Evaluate a model on COCO dataset.
     Returns:
         float: CIDEr score
@@ -80,7 +81,6 @@ def captioner(
         input_ids = input_ids
         attention_mask = attention_mask
     else:
-
         encodings = tokenizer(
             [prompt],
             padding="longest",
@@ -93,7 +93,7 @@ def captioner(
         image_start_index_list = image_start_index_list
         image_nums = image_nums
         if debug:
-            print("input--->",tokenizer.decode(input_ids[0]))
+            print("input--->", tokenizer.decode(input_ids[0]))
         p1 = MinNewTokensLengthLogitsProcessor(
             prompt_length_to_skip=input_ids.shape[-1],
             min_new_tokens=5,
@@ -114,7 +114,7 @@ def captioner(
             logits_processor_list=[p1, visual_logits_processor],
         )
         if debug:
-            print("outputs--->",tokenizer.decode(outputs[0]))
+            print("outputs--->", tokenizer.decode(outputs[0]))
         if outputs[0, -2] in [previsual_token_id, visual_token_id] and outputs[0, -1] == bos_token_id:
             prompt = tokenizer.decode(outputs.clone()[0])
             is_visual = (outputs[0, -2] == visual_token_id)
@@ -132,7 +132,7 @@ def captioner(
             image_start_index_list = [[x] for x in image_start_index_list]
             image_nums = [1] * len(input_ids)
             if debug:
-                print("get the visual bbox--->",tokenizer.decode(input_ids[0]))
+                print("get the visual bbox--->", tokenizer.decode(input_ids[0]))
             with torch.no_grad():
                 outputs = model(
                     vision_x=batch_images,
@@ -145,6 +145,8 @@ def captioner(
                 )
             boxes = outputs["boxes"]
             scores = outputs["scores"]
+            if debug:
+                print("box num---->", len(boxes))
             # if not model.valid:
             #     import pdb; pdb.set_trace()
             if boxes is not None:
@@ -168,7 +170,8 @@ def captioner(
                 open_cv_image = np.array(image_ori)
                 open_cv_image = open_cv_image[:, :, ::-1].copy()
                 for i, pre_box in enumerate(boxes):
-                    open_cv_image = cv2.rectangle(open_cv_image, pre_box[:2].astype(int), pre_box[2:].astype(int), (0, 255, 0), i + 1)
+                    open_cv_image = cv2.rectangle(open_cv_image, pre_box[:2].astype(int), pre_box[2:].astype(int),
+                                                  (0, 255, 0), i + 1)
                 out_image = Image.fromarray(cv2.cvtColor(open_cv_image, cv2.COLOR_BGR2RGB))
                 # exit()
                 pre_box = boxes[scores.argmax()]
@@ -181,7 +184,14 @@ def captioner(
             else:
                 # if debug:
                 #     import pdb;pdb.set_trace()
+                prompt = tokenizer.decode(outputs.clone()[0])
+                if debug:
+                    print("before else---->", prompt)
                 prompt = tokenizer.decode(outputs[0, :-2].clone()[0])
+                if debug:
+                    print("after else---->", prompt)
+
+
         else:
             break
     outputs = outputs[:, ori_prompt_length:]
@@ -190,7 +200,8 @@ def captioner(
     #     postprocess_captioning_generation(out).replace('"', "")
    #     for out in tokenizer.batch_decode(outputs, skip_special_tokens=True)
    # ]
-
+    # import pdb; pdb.set_trace()
+    print("out----------------------------------------------------------------------------------------->")
    return outputs, out_image
 
 
@@ -428,5 +439,4 @@ def evaluate_coco_flickr(
     metrics = {}
     metrics["CIDEr"] = 0.0
 
-
     return metrics["CIDEr"]