from torch.utils.data import Dataset
import json
from PIL import Image
import os
import torch
import more_itertools
from tqdm import tqdm
import time
from vqa_metric import compute_gqa_accuracy
import string
import uuid
import numpy as np
import cv2
from open_flamingo.eval.task.utils import get_bbox
class GQADataset(Dataset):
def __init__(
self,
image_dir_path="/gpfs/u/home/LMCG/LMCGljnn/scratch/datasets/raw/gqa/images",
annotations_path="/gpfs/u/home/LMCG/LMCGljnn/scratch/datasets/raw/gqa/testdev_balanced_questions.json",
):
annotations = json.load(open(annotations_path))
self.questions = []
self.answers = []
self.image_paths = []
self.question_ids = []
for anno_id in annotations:
question = annotations[anno_id]["question"]
imageId = annotations[anno_id]["imageId"]
answer = annotations[anno_id]["answer"]
self.questions.append(question)
self.answers.append(answer)
self.image_paths.append(os.path.join(image_dir_path, "{}.jpg".format(imageId)))
self.question_ids.append(anno_id)
# print(annotations[anno_id]["types"])
self.vqa_dataset = "gqa"
def __len__(self):
return len(self.questions)
def __getitem__(self, idx):
question = self.questions[idx]
question_id = self.question_ids[idx]
answer = self.answers[idx]
img_path = self.image_paths[idx]
image = Image.open(img_path)
return {
"image": image,
"question": question,
"answers": answer,
"question_id": question_id,
}
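# A minimal, standalone usage sketch of GQADataset (the default GQA paths above are
# cluster-specific placeholders; point them at a local copy of the images and the
# testdev_balanced_questions.json file before running):
#
#   ds = GQADataset(
#       image_dir_path="/path/to/gqa/images",
#       annotations_path="/path/to/testdev_balanced_questions.json",
#   )
#   sample = ds[0]
#   print(len(ds), sample["question"], sample["answers"])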
def prepare_batch_images(batch, image_processor):
batch_images = None
for b in batch:
        # image_processor returns a (C, H, W) tensor; expand it to the
        # (B, T_img, F, C, H, W) layout the model expects (one image, one frame per sample).
        b_image = image_processor(b["image"]).unsqueeze(0).unsqueeze(1).unsqueeze(0)
if batch_images is None:
batch_images = b_image
else:
batch_images = torch.cat([batch_images, b_image], dim=0)
return batch_images
def evaluate_gqa(
model,
tokenizer,
image_processor,
batch_size=1,
vis_embed_size=None,
rank=0,
world_size=1,
id=0,
):
"""
Evaluate a model on VQA datasets. Currently supports VQA v2.0.
Args:
model (nn.Module): model to evaluate
tokenizer (transformers.PreTrainedTokenizer): tokenizer for the model
image_processor : image processor for the model
batch_size (int): batch size
image_dir_path (str): path to image directory
questions_json_path (str): path to questions json file
annotations_json_path (str): path to annotations json file
seed (int, optional): random seed. Defaults to 42.
max_generation_length (int, optional): max generation length. Defaults to 5.
num_beams (int, optional): number of beams to use for beam search. Defaults to 3.
length_penalty (float, optional): length penalty for beam search. Defaults to -2.0.
num_samples (int, optional): number of samples to evaluate on. Defaults to 5000 samples.
query_set_size (int, optional): size of the query set. Defaults to 2048.
num_shots (int, optional): number of shots to use. Defaults to 8.
device (int, optional): device to use. Defaults to -1 (cpu).
num_workers (int, optional): number of workers to use. Defaults to 4.
vqa_dataset (string): type of vqa dataset: currently supports vqa, ok_vqa. Defaults to vqa.
Returns:
float: accuracy score
"""
assert batch_size == 1
vqa_dataset = "gqa"
eval_dataset = GQADataset()
object_token_id = tokenizer("<|#object#|>", add_special_tokens=False)["input_ids"][-1]
endofobject_token_id = tokenizer("<|#endofobject#|>", add_special_tokens=False)["input_ids"][-1]
prebox_token_id = tokenizer("<|#prebox#|>", add_special_tokens=False)["input_ids"][-1]
media_token_id = tokenizer("<|#image#|>", add_special_tokens=False)["input_ids"][-1]
endofmedia_token_id = tokenizer("<|#endofimage#|>", add_special_tokens=False)["input_ids"][-1]
pad_token_id = tokenizer(tokenizer.pad_token, add_special_tokens=False)["input_ids"][-1]
bos_token_id = tokenizer(tokenizer.bos_token, add_special_tokens=False)["input_ids"][-1]
def get_prompt(sample):
return f"{tokenizer.bos_token}<|#image#|>{tokenizer.pad_token*vis_embed_size}<|#endofimage#|>Question: {sample['question'].strip()} Short answer:"
model.eval().cuda()
lang_encoder_name = model.lang_encoder.__class__.__name__.lower()
predictions = []
if batch_size != 1:
tokenizer.padding_side = "left"
if world_size > 1:
torch.distributed.barrier()
this_tot = 0
for ii, batch in enumerate(more_itertools.chunked(
tqdm(eval_dataset, desc="Running inference", disable=(rank != 0)), batch_size,
)):
if ii % world_size != rank:
continue
batch[0]["image"] = batch[0]["image"].resize((224, 224))
batch_images = prepare_batch_images(
batch=batch,
image_processor=image_processor,
).cuda()
batch_text = [get_prompt(s) for s in batch]
encodings = tokenizer(
batch_text,
return_tensors="pt",
padding="longest",
truncation=True,
max_length=2000,
)
input_ids = encodings["input_ids"].cuda()
attention_mask = encodings["attention_mask"].cuda()
image_start_index_list = ((input_ids == media_token_id).nonzero(as_tuple=True)[-1] + 1).tolist()
image_start_index_list = [[x] for x in image_start_index_list]
image_nums = [1] * len(input_ids)
        with torch.inference_mode(), torch.cuda.amp.autocast(dtype=torch.float16):
outputs = model.generate(
batch_images,
input_ids,
attention_mask=attention_mask,
max_new_tokens=10,
min_length=1,
num_beams=1,
# length_penalty=0,
image_start_index_list=image_start_index_list,
image_nums=image_nums,
added_bbox_list=None,
return_dict_in_generate=True,
output_scores=True,
)
scores = outputs.scores
outputs = outputs.sequences[:, len(input_ids[0]) :]
        # If <|#object#|> is among the top-5 candidates for the first generated token,
        # switch to the grounded (previsual/prebox) prompting path below.
        if object_token_id in scores[0][0].sort(descending=True).indices[:5]:
sample = batch[0]
# print("="*80)
# print("sample:", batch, scores[0][0].sort(descending=True).indices[:10].tolist().index(object_token_id))
prompt1 = [f"{tokenizer.bos_token}<|#image#|>{tokenizer.pad_token*vis_embed_size}<|#endofimage#|>Question: {sample['question'].strip()} Short answer:<|#object#|><|#previsual#|>"]
boxes, scores = get_bbox(None, batch_images, prompt1, model, tokenizer, media_token_id, prebox_token_id, return_all=True)
# open_cv_image = np.array(sample["image"])
# open_cv_image = open_cv_image[:, :, ::-1].copy()
# cv2.imwrite(f"Atest_ori.png", open_cv_image)
# open_cv_image = cv2.rectangle(open_cv_image, boxes[0][:2].astype(int), boxes[0][2:].astype(int), (0, 255, 0), 2)
# print(scores)
# cv2.imwrite(f"Atest.png", open_cv_image)
if boxes is not None and len(boxes) > 0:
prompt2 = [f"{tokenizer.bos_token}<|#image#|>{tokenizer.pad_token*vis_embed_size}<|#endofimage#|>Question: {sample['question'].strip()} Short answer: it is<|#object#|><|#previsual#|><|#prebox#|><|#object#|> a"]
encodings = tokenizer(
prompt2,
return_tensors="pt",
padding="longest",
truncation=True,
max_length=2000,
)
input_ids = encodings["input_ids"].cuda()
attention_mask = encodings["attention_mask"].cuda()
image_start_index_list = ((input_ids == media_token_id).nonzero(as_tuple=True)[-1] + 1).tolist()
image_start_index_list = [[x] for x in image_start_index_list]
image_nums = [1] * len(input_ids)
                # Normalize the top box from 224x224 pixel coordinates to [0, 1) before
                # feeding it back to the model as a grounded region.
                added_bbox_list = [torch.tensor(boxes[0] / 224.0).cuda().unsqueeze(0).clamp(0, 0.99)]
                with torch.inference_mode(), torch.cuda.amp.autocast(dtype=torch.float16):
outputs = model.generate(
batch_images,
input_ids,
attention_mask=attention_mask,
max_new_tokens=10,
min_length=1,
num_beams=1,
image_start_index_list=image_start_index_list,
image_nums=image_nums,
added_bbox_list=added_bbox_list,
eos_token_id=(endofobject_token_id),
)
outputs = outputs[:, len(input_ids[0]) :]
# print("previsual===>{}".format(tokenizer.decode(outputs[0], skip_special_tokens=True).strip().lower().strip(string.punctuation+" ")))
# postprocess begin
new_predictions = [
out.strip().lower().strip(string.punctuation+" ") for out in tokenizer.batch_decode(outputs, skip_special_tokens=True)
]
this_tot += 1
predictions.extend(
[
{"answer": p, "question_id": sample["question_id"], "_question": sample["question"], "answers": sample["answers"]}
for p, sample in zip(new_predictions, batch)
]
)
with open(f"{vqa_dataset}_{lang_encoder_name}_results_part{rank}_{id}.json", "w") as f:
f.write(json.dumps(predictions))
print("save to", f"{vqa_dataset}_{lang_encoder_name}_results_part{rank}_{id}.json")
time.sleep(10)
if world_size > 1:
torch.distributed.barrier()
if rank == 0:
print(f"evaluate on rank {rank}. world size is {world_size}")
predictions = []
for rank_i in range(world_size):
print("load", f"{vqa_dataset}_{lang_encoder_name}_results_part{rank_i}_{id}.json")
predictions.extend(json.load(open(f"{vqa_dataset}_{lang_encoder_name}_results_part{rank_i}_{id}.json")))
os.remove(f"{vqa_dataset}_{lang_encoder_name}_results_part{rank_i}_{id}.json")
print("num:", len(predictions))
# save the predictions to a temporary file
random_uuid = str(uuid.uuid4())
with open(f"{vqa_dataset}results_{random_uuid}.json", "w") as f:
f.write(json.dumps(predictions, indent=4))
acc = compute_gqa_accuracy(predictions)
print(vqa_dataset, "score:", acc, "| save to", f"{vqa_dataset}results_{random_uuid}.json")
os.makedirs("eval_results", exist_ok=True)
with open(os.path.join("eval_results", f"{vqa_dataset}_{model.expr_name}_{model.step_num}_{int(time.time())}_{acc}"), "w") as f:
f.write(json.dumps(predictions, indent=2))
# delete the temporary file
os.remove(f"{vqa_dataset}results_{random_uuid}.json")
else:
time.sleep(5)
acc = 0.0
if world_size > 1:
torch.distributed.barrier()
return acc
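# Rough usage sketch for evaluate_gqa (illustrative only; the model, tokenizer and
# image processor are assumed to come from the surrounding open_flamingo evaluation
# harness, with the <|#...#|> special tokens already added to the tokenizer):
#
#   acc = evaluate_gqa(
#       model=model,                      # Flamingo-style model exposing .generate()
#       tokenizer=tokenizer,
#       image_processor=image_processor,  # transform returning a (C, H, W) tensor
#       batch_size=1,
#       vis_embed_size=vis_embed_size,
#       rank=rank,
#       world_size=world_size,
#       id=0,
#   )
#   if rank == 0:
#       print("GQA accuracy:", acc)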