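# Evaluation of the COLA multi-object matching benchmark with a grounded
# open_flamingo-style model: each sample pairs two Visual Genome images with
# two attributed-object captions, and the model must match every caption to
# its own image via perplexity-style scoring of grounded prompts.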
import json
import webdataset as wds
from tqdm import tqdm
from PIL import Image
import torch
import numpy as np
import os
import time
import cv2
import random
import math
from open_flamingo.eval.task.utils import (
    get_object_from_text,
    is_correct,
    _eval_text_image,
    get_bbox,
    get_iou,
)

DATASET = "/gpfs/u/home/LMCG/LMCGljnn/scratch/code/COLA/data/COLA_multiobjects_matching_benchmark.json"
VG_ROOT = "/gpfs/u/home/LMCG/LMCGljnn/scratch/datasets/raw/vg/VG_100K"
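

# get_score: compute a matching loss between `text` (an attributed two-object
# caption) and `image`; lower is better.  Scoring runs in two stages: first the
# model grounds object A and its attributes, then, conditioned on that box and
# the stated relation, it grounds object B and scores its attributes.  The
# returned value is the mean of the two stage losses.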
def get_score(image, text, model, tokenizer, image_processor, vis_embed_size):
    media_token_id = tokenizer("<|#image#|>", add_special_tokens=False)["input_ids"][-1]
    prebox_token_id = tokenizer("<|#prebox#|>", add_special_tokens=False)["input_ids"][-1]
    object_token_id = tokenizer("<|#object#|>", add_special_tokens=False)["input_ids"][-1]
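    # COLA captions come as "attr(s) nounA # relation # attr(s) nounB"; split
    # into the two attributed objects and the connecting relation.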
    text = text.split("#")
    obj_A = text[0].strip().split(" ")
    relation = text[1].strip()
    obj_B = text[2].strip().split(" ")
    # "computer mouse" is special-cased as the only two-word noun; otherwise the
    # last word is the noun and everything before it is an attribute.
    if "computer mouse" not in text[0].strip():
        attrAs = obj_A[:-1]
        nounA = obj_A[-1]
    else:
        attrAs = obj_A[:-2]
        nounA = " ".join(obj_A[-2:])
    if "computer mouse" not in text[2].strip():
        attrBs = obj_B[:-1]
        nounB = obj_B[-1]
    else:
        attrBs = obj_B[:-2]
        nounB = " ".join(obj_B[-2:])
    # print("="*80)
    # print(attrAs, nounA)
    # print(attrBs, nounB)
    # print(relation)
    # print("="*80)
    batch_images = image_processor(image).unsqueeze(0).unsqueeze(1).unsqueeze(0)
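    # Stage 1: ask the model for candidate boxes for the bare noun of object A.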
    prompt1 = [f"{tokenizer.bos_token}<|#image#|>{tokenizer.pad_token*vis_embed_size}<|#endofimage#|><|#object#|>the {nounA}<|#endofobject#|><|#visual#|>"]
    boxes, scores = get_bbox(None, batch_images, prompt1, model, tokenizer, media_token_id, prebox_token_id, return_all=True)
    # open_cv_image = np.array(image)
    # open_cv_image = open_cv_image[:, :, ::-1].copy()
    # for pre_box in boxes:
    #     open_cv_image = cv2.rectangle(open_cv_image, pre_box[:2].astype(int), pre_box[2:].astype(int), (0, 255, 0), 2)
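    # Score every candidate box against each attribute of object A: the prompt
    # feeds the box back in via <|#prebox#|>, and the LM loss is measured only
    # on the tokens after the final <|#object#|> marker (the attributed phrase).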
    box_ppl = []
    box_attr_losses = []
    for box in boxes:
        losses = []
        for attrA in attrAs:
            prompt2 = [f"{tokenizer.bos_token}<|#image#|>{tokenizer.pad_token*vis_embed_size}<|#endofimage#|><|#object#|><|#previsual#|><|#prebox#|><|#object#|> the {attrA} {nounA}"]
            encodings = tokenizer(
                prompt2,
                padding="longest",
                truncation=True,
                return_tensors="pt",
                max_length=512,
            )
            input_ids = encodings["input_ids"]
            attention_mask = encodings["attention_mask"]
            image_start_index_list = ((input_ids == media_token_id).nonzero(as_tuple=True)[-1] + 1).tolist()
            image_start_index_list = [[x] for x in image_start_index_list]
            image_nums = [1] * len(input_ids)
            vision_x = batch_images.cuda()
            lang_x = input_ids.cuda()
            attention_mask = attention_mask.cuda()
            labels = lang_x.clone()
            start_idx = (labels == object_token_id).nonzero()[-1, -1]
            labels[0, :start_idx + 1] = -100
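            # Box coordinates are in pixels of the 224x224 input, so normalise to [0, 1].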
            added_bbox_list = [torch.tensor(box / 224.0).cuda().unsqueeze(0)]
            # `with a() and b():` enters only one of the two context managers;
            # use a compound `with` so both autocast and no_grad take effect.
            with torch.cuda.amp.autocast(dtype=torch.float16), torch.no_grad():
                outputs = model(
                    vision_x=vision_x,
                    lang_x=lang_x,
                    attention_mask=attention_mask,
                    labels=labels,
                    image_nums=image_nums,
                    image_start_index_list=image_start_index_list,
                    added_bbox_list=added_bbox_list,
                    add_box=added_bbox_list is not None,
                    relations=None,
                )
            loss = outputs.loss
            # Average the token losses over the unmasked (non-zero) positions.
            loss = (loss.sum() / (loss != 0).sum()).item()
            losses.append(loss)
        avg_ppl = np.array(losses).mean()
        box_ppl.append(avg_ppl)
        box_attr_losses.append(losses)
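    # Keep the (box, attribute) pair with the lowest loss for object A; that
    # minimum is the first half of the final score.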
    fit_idx = np.array(box_ppl).argmin()
    fit_box = boxes[fit_idx]
    fit_attr = attrAs[np.array(box_attr_losses[fit_idx]).argmin()]
    first_ppl = min(box_ppl)
    # open_cv_image = cv2.rectangle(open_cv_image, fit_box[:2].astype(int), fit_box[2:].astype(int), (255, 0, 0), 2)
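    # Stage 2: condition on object A's best box and the stated relation, and ask
    # the model for candidate boxes for object B.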
    prompt3 = [f"{tokenizer.bos_token}<|#image#|>{tokenizer.pad_token*vis_embed_size}<|#endofimage#|><|#object#|>the {fit_attr} {nounA}<|#endofobject#|><|#visual#|><|#box#|><|#endofobject#|> is {relation}<|#object#|><|#previsual#|>"]
    boxes, scores = get_bbox([torch.tensor(fit_box / 224).cuda().unsqueeze(0)], batch_images, prompt3, model, tokenizer, media_token_id, prebox_token_id, return_all=True)
    # for i, pre_box in enumerate(boxes):
    #     open_cv_image = cv2.rectangle(open_cv_image, pre_box[:2].astype(int), pre_box[2:].astype(int), (0, 0, 255), i+1)
    # cv2.imwrite(f"Atest.png", open_cv_image)
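    # Score object B's attributes for each candidate box; the per-box losses are
    # combined below, weighted by the box confidences returned by get_bbox.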
    box_ppl = []
    for box in boxes:
        losses = []
        for attrB in attrBs:
            prompt4 = [f"{tokenizer.bos_token}<|#image#|>{tokenizer.pad_token*vis_embed_size}<|#endofimage#|><|#object#|>the {fit_attr} {nounA}<|#endofobject#|><|#visual#|><|#box#|><|#endofobject#|> is {relation}<|#object#|><|#previsual#|><|#prebox#|><|#object#|> the {attrB} {nounB}"]
            encodings = tokenizer(
                prompt4,
                padding="longest",
                truncation=True,
                return_tensors="pt",
                max_length=512,
            )
            input_ids = encodings["input_ids"]
            attention_mask = encodings["attention_mask"]
            image_start_index_list = ((input_ids == media_token_id).nonzero(as_tuple=True)[-1] + 1).tolist()
            image_start_index_list = [[x] for x in image_start_index_list]
            image_nums = [1] * len(input_ids)
            vision_x = batch_images.cuda()
            lang_x = input_ids.cuda()
            attention_mask = attention_mask.cuda()
            labels = lang_x.clone()
            start_idx = (labels == object_token_id).nonzero()[-1, -1]
            labels[0, :start_idx + 1] = -100
            added_bbox_list = [torch.tensor(fit_box / 224.0).cuda().unsqueeze(0), torch.tensor(box / 224.0).cuda().unsqueeze(0)]
            with torch.cuda.amp.autocast(dtype=torch.float16), torch.no_grad():
                outputs = model(
                    vision_x=vision_x,
                    lang_x=lang_x,
                    attention_mask=attention_mask,
                    labels=labels,
                    image_nums=image_nums,
                    image_start_index_list=image_start_index_list,
                    added_bbox_list=added_bbox_list,
                    add_box=added_bbox_list is not None,
                    relations=None,
                )
            loss = outputs.loss
            loss = (loss.sum() / (loss != 0).sum()).item()
            losses.append(loss)
        avg_ppl = np.array(losses).mean()
        box_ppl.append(avg_ppl)
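    # Second half of the score: confidence-weighted average of the per-box losses for object B.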
    second_ppl = (np.array(box_ppl) * np.array(scores)).sum() / sum(scores)
    return (first_ppl + second_ppl) / 2


def evaluate_cola(
    model,
    tokenizer,
    image_processor,
    vis_embed_size=None,
    rank=0,
    world_size=1,
    id=0,
    debug=False,
):
    dataset_name = "cola"
    dataset = json.load(open(DATASET))
    model = model.cuda().eval()
    correct = 0
    total = 0
    pbar = tqdm(dataset, disable=(rank != 0))
    for ii, sample in enumerate(pbar):
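        # Shard the dataset across ranks: rank r handles every world_size-th sample.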
        if ii % world_size != rank:
            continue
        image1 = Image.open(os.path.join(VG_ROOT, os.path.basename(sample[0]))).convert("RGB").resize((224, 224))
        text1 = sample[1]
        image2 = Image.open(os.path.join(VG_ROOT, os.path.basename(sample[2]))).convert("RGB").resize((224, 224))
        text2 = sample[3]
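        # get_score returns a loss (lower is better), so negate it to obtain a
        # similarity score (higher is better).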
        score11 = -get_score(image1, text1, model, tokenizer, image_processor, vis_embed_size)
        score12 = -get_score(image1, text2, model, tokenizer, image_processor, vis_embed_size)
        score21 = -get_score(image2, text1, model, tokenizer, image_processor, vis_embed_size)
        score22 = -get_score(image2, text2, model, tokenizer, image_processor, vis_embed_size)
        if rank == 0:
            tqdm.write(f"{score11:.2f} {score12:.2f} {score21:.2f} {score22:.2f}")
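        # A pair is correct only if caption 1 prefers image 1 over image 2 and
        # caption 2 prefers image 2 over image 1.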
        if score11 > score21 and score22 > score12:
            correct += 1
        total += 1
        pbar.set_description(f"{correct / total:.2f}")
    print(rank, correct / total)
    with open(f"{dataset_name}_results_part{rank}_{id}.json", "w") as f:
        f.write(json.dumps([total, correct]))
    if world_size > 1:
        torch.distributed.barrier()
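    # Rank 0 gathers the per-rank counts written above, removes the temporary
    # part files, and reports the overall accuracy.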
    if rank == 0:
        total = 0
        correct = 0
        print(f"evaluate on rank {rank}. world size is {world_size}")
        for rank_i in range(world_size):
            [total_part, correct_part] = json.load(open(f"{dataset_name}_results_part{rank_i}_{id}.json"))
            os.remove(f"{dataset_name}_results_part{rank_i}_{id}.json")
            total += total_part
            correct += correct_part
        score = correct / total
        print("score:", score)
        # Record the result as an empty marker file whose name encodes the run and score.
        with open(os.path.join("eval_results", f"{dataset_name}_{model.expr_name}_{model.step_num}_{int(time.time())}_{score}_{total}"), "w") as f:
            pass
    else:
        score = 0.0
    if world_size > 1:
        torch.distributed.barrier()
    return score
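

# Note: the stub below passes None for the model, tokenizer, and image
# processor, so running it as-is will fail at model.cuda(); a real entry point
# must supply a loaded model (see the sketch at the end of the file).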
if __name__ == "__main__":
    evaluate_cola(None, None, None)
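
# A minimal usage sketch (not part of the original harness): launching the
# evaluation under torch.distributed.  `load_model` is a hypothetical helper
# standing in for whatever loads the checkpoint and returns
# (model, tokenizer, image_processor, vis_embed_size).
#
#     import torch.distributed as dist
#
#     dist.init_process_group("nccl")
#     rank, world_size = dist.get_rank(), dist.get_world_size()
#     torch.cuda.set_device(rank % torch.cuda.device_count())
#     model, tokenizer, image_processor, vis_embed_size = load_model(...)  # hypothetical loader
#     score = evaluate_cola(
#         model, tokenizer, image_processor,
#         vis_embed_size=vis_embed_size,
#         rank=rank, world_size=world_size,
#     )
#     if rank == 0:
#         print("COLA accuracy:", score)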