Commit ed3e57a by chendl (parent: 83aaeb8)

update cap
app.py CHANGED
@@ -2,8 +2,8 @@ import os
 import sys
 from pathlib import Path
 # os.system("cd transformers && pip install .")
-os.system("cd multimodal && pip install .")
-os.system("cd multimodal/YOLOX && pip install .")
+# os.system("cd multimodal && pip install -e .")
+# os.system("cd multimodal/YOLOX && pip install .")
 import numpy as np
 import torch
 from PIL import Image
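The app.py change disables the runtime `pip install` calls rather than replacing them. For reference, a minimal sketch of how the same startup installs could be done with `subprocess` instead of `os.system`, so a failed install raises instead of passing silently; `install_editable` is a hypothetical helper not in the commit, and the package paths are the ones from the diff.

import subprocess
import sys
from pathlib import Path

def install_editable(pkg_dir: str) -> None:
    # Hypothetical helper (not part of the commit): run `pip install -e <pkg_dir>`
    # with the current interpreter and fail loudly on a non-zero exit code.
    subprocess.run(
        [sys.executable, "-m", "pip", "install", "-e", str(Path(pkg_dir))],
        check=True,
    )

# install_editable("multimodal")
# install_editable("multimodal/YOLOX")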
multimodal/open_flamingo/chat/conversation.py CHANGED
@@ -324,7 +324,7 @@ class Chat:
 repetition_penalty=1.0, length_penalty=1, temperature=1, max_length=2000):
 # conv.append_message(conv.roles[1], None)
 # embs = self.get_context_emb(conv, img_list)
-#
+#
 # # current_max_len = embs.shape[1] + max_new_tokens + 100
 # # begin_idx = max(0, current_max_len - max_length)
 # # embs = embs[:, begin_idx:]
@@ -494,7 +494,7 @@ class Chat:
 # if len(image.shape) == 3:
 # image = image.unsqueeze(0)
 # # image = image.to(self.device)
-#
+#
 # # image_emb, _ = self.model.encode_img(image)
 # img_list.append(image_emb)
 # conv.append_message(conv.roles[0], "<Img><ImageHere></Img>")
@@ -587,4 +587,3 @@ def evaluate_exp(



-
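The conversation.py hunks only touch commented-out lines (the paired `- #` / `+ #` lines appear to differ only in trailing whitespace). The surrounding comments describe trimming the context embeddings so that the prompt plus the new tokens stays under `max_length`; a sketch of that logic, assuming `embs`, `max_new_tokens`, and `max_length` are as in the `Chat` method shown above:

# Sketch based on the commented-out lines in the diff, not code from the commit.
current_max_len = embs.shape[1] + max_new_tokens + 100  # expected length with headroom
begin_idx = max(0, current_max_len - max_length)         # how much old context to drop
embs = embs[:, begin_idx:]                               # keep only the most recent embeddings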
multimodal/open_flamingo/eval/task/caption_chat.py CHANGED
@@ -51,7 +51,8 @@ def prepare_batch_images(batch, image_processor):


 def captioner(
-model,tokenizer,image_ori,batch_images,input_ids,attention_mask,image_start_index_list,image_nums,added_bbox_list,debug=True):
+model, tokenizer, image_ori, batch_images, input_ids, attention_mask, image_start_index_list, image_nums,
+added_bbox_list, debug=True):
 """Evaluate a model on COCO dataset.
 Returns:
 float: CIDEr score
@@ -80,7 +81,6 @@ def captioner(
 input_ids = input_ids
 attention_mask = attention_mask
 else:
-
 encodings = tokenizer(
 [prompt],
 padding="longest",
@@ -93,7 +93,7 @@ def captioner(
 image_start_index_list = image_start_index_list
 image_nums = image_nums
 if debug:
-print("input--->",tokenizer.decode(input_ids[0]))
+print("input--->", tokenizer.decode(input_ids[0]))
 p1 = MinNewTokensLengthLogitsProcessor(
 prompt_length_to_skip=input_ids.shape[-1],
 min_new_tokens=5,
@@ -114,7 +114,7 @@ def captioner(
 logits_processor_list=[p1, visual_logits_processor],
 )
 if debug:
-print("outputs--->",tokenizer.decode(outputs[0]))
+print("outputs--->", tokenizer.decode(outputs[0]))
 if outputs[0, -2] in [previsual_token_id, visual_token_id] and outputs[0, -1] == bos_token_id:
 prompt = tokenizer.decode(outputs.clone()[0])
 is_visual = (outputs[0, -2] == visual_token_id)
@@ -132,7 +132,7 @@ def captioner(
 image_start_index_list = [[x] for x in image_start_index_list]
 image_nums = [1] * len(input_ids)
 if debug:
-print("get the visual bbox--->",tokenizer.decode(input_ids[0]))
+print("get the visual bbox--->", tokenizer.decode(input_ids[0]))
 with torch.no_grad():
 outputs = model(
 vision_x=batch_images,
@@ -145,6 +145,8 @@ def captioner(
 )
 boxes = outputs["boxes"]
 scores = outputs["scores"]
+if debug:
+print("box num---->", len(boxes))
 # if not model.valid:
 # import pdb; pdb.set_trace()
 if boxes is not None:
@@ -168,7 +170,8 @@ def captioner(
 open_cv_image = np.array(image_ori)
 open_cv_image = open_cv_image[:, :, ::-1].copy()
 for i, pre_box in enumerate(boxes):
-open_cv_image = cv2.rectangle(open_cv_image, pre_box[:2].astype(int), pre_box[2:].astype(int), (0, 255, 0), i+1)
+open_cv_image = cv2.rectangle(open_cv_image, pre_box[:2].astype(int), pre_box[2:].astype(int),
+(0, 255, 0), i + 1)
 out_image = Image.fromarray(cv2.cvtColor(open_cv_image, cv2.COLOR_BGR2RGB))
 # exit()
 pre_box = boxes[scores.argmax()]
@@ -181,7 +184,14 @@ def captioner(
 else:
 # if debug:
 # import pdb;pdb.set_trace()
+prompt = tokenizer.decode(outputs.clone()[0])
+if debug:
+print("before else---->", prompt)
 prompt = tokenizer.decode(outputs[0, :-2].clone()[0])
+if debug:
+print("after else---->", prompt)
+
+
 else:
 break
 outputs = outputs[:, ori_prompt_length:]
@@ -190,7 +200,8 @@ def captioner(
 # postprocess_captioning_generation(out).replace('"', "")
 # for out in tokenizer.batch_decode(outputs, skip_special_tokens=True)
 # ]
-# import pdb; pdb.set_trace()
+# import pdb; pdb.set_trace()
+print("out----------------------------------------------------------------------------------------->")
 return outputs, out_image


@@ -428,5 +439,4 @@ def evaluate_coco_flickr(
 metrics = {}
 metrics["CIDEr"] = 0.0

-
 return metrics["CIDEr"]
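The caption_chat.py changes are mostly diagnostic: the `captioner` signature is re-wrapped, several `print(...)` debug lines are added, and the re-wrapped `cv2.rectangle` call keeps the original behaviour of using `i + 1` as the line thickness, so later boxes are drawn thicker. Note that the new `print("box num---->", len(boxes))` runs before the `if boxes is not None:` check, so it would raise a TypeError whenever `boxes` comes back as None. Below is a minimal sketch of routing this output through `logging` instead of bare `print`, so it can be silenced without editing the code; `debug_decode` is a hypothetical helper, not part of the commit.

import logging

logger = logging.getLogger("captioner")

def debug_decode(label, tokenizer, ids):
    # Hypothetical helper (not part of the commit): log a decoded token
    # sequence at DEBUG level instead of printing it unconditionally.
    logger.debug("%s ---> %s", label, tokenizer.decode(ids))

# Usage inside captioner(), mirroring the prints added in this commit:
# debug_decode("input", tokenizer, input_ids[0])
# debug_decode("outputs", tokenizer, outputs[0])
# debug_decode("get the visual bbox", tokenizer, input_ids[0])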