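# GQA (testdev-balanced) evaluation for an Open-Flamingo-style model that uses
# grounding tokens (<|#image#|>, <|#object#|>, <|#previsual#|>, <|#prebox#|>).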
from torch.utils.data import Dataset
import json
from PIL import Image
import os
import torch
import more_itertools
from tqdm import tqdm
import time
from vqa_metric import compute_gqa_accuracy
import string
import uuid
import numpy as np
import cv2
from open_flamingo.eval.task.utils import get_bbox
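
# Dataset wrapper around the GQA testdev_balanced_questions.json annotations.
# Each item pairs a question with its image (looked up by imageId) and answer.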
class GQADataset(Dataset):
    def __init__(
        self,
        image_dir_path="/gpfs/u/home/LMCG/LMCGljnn/scratch/datasets/raw/gqa/images",
        annotations_path="/gpfs/u/home/LMCG/LMCGljnn/scratch/datasets/raw/gqa/testdev_balanced_questions.json",
    ):
        annotations = json.load(open(annotations_path))
        self.questions = []
        self.answers = []
        self.image_paths = []
        self.question_ids = []
        for anno_id in annotations:
            question = annotations[anno_id]["question"]
            imageId = annotations[anno_id]["imageId"]
            answer = annotations[anno_id]["answer"]
            self.questions.append(question)
            self.answers.append(answer)
            self.image_paths.append(os.path.join(image_dir_path, "{}.jpg".format(imageId)))
            self.question_ids.append(anno_id)
            # print(annotations[anno_id]["types"])
        self.vqa_dataset = "gqa"

    def __len__(self):
        return len(self.questions)

    def __getitem__(self, idx):
        question = self.questions[idx]
        question_id = self.question_ids[idx]
        answer = self.answers[idx]
        img_path = self.image_paths[idx]
        image = Image.open(img_path)
        return {
            "image": image,
            "question": question,
            "answers": answer,
            "question_id": question_id,
        }
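
# Stacks processed images into a (batch, T_img=1, frames=1, C, H, W) tensor,
# the layout Flamingo-style vision inputs typically expect.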
def prepare_batch_images(batch, image_processor):
    batch_images = None
    for b in batch:
        b_image = image_processor(b["image"]).unsqueeze(0).unsqueeze(1).unsqueeze(0)
        if batch_images is None:
            batch_images = b_image
        else:
            batch_images = torch.cat([batch_images, b_image], dim=0)
    return batch_images

def evaluate_gqa(
    model,
    tokenizer,
    image_processor,
    batch_size=1,
    vis_embed_size=None,
    rank=0,
    world_size=1,
    id=0,
):
    """
    Evaluate a model on the GQA testdev-balanced split.

    Args:
        model (nn.Module): model to evaluate
        tokenizer (transformers.PreTrainedTokenizer): tokenizer for the model
        image_processor: image processor for the model
        batch_size (int): batch size; only batch_size == 1 is supported
        vis_embed_size (int): number of visual-embedding placeholder tokens inserted
            between <|#image#|> and <|#endofimage#|> in the prompt
        rank (int): rank of this process in distributed evaluation
        world_size (int): total number of evaluation processes
        id: run identifier used to name per-rank intermediate result files

    Returns:
        float: GQA accuracy on rank 0; 0.0 on all other ranks
    """
    assert batch_size == 1
    vqa_dataset = "gqa"
    eval_dataset = GQADataset()
    object_token_id = tokenizer("<|#object#|>", add_special_tokens=False)["input_ids"][-1]
    endofobject_token_id = tokenizer("<|#endofobject#|>", add_special_tokens=False)["input_ids"][-1]
    prebox_token_id = tokenizer("<|#prebox#|>", add_special_tokens=False)["input_ids"][-1]
    media_token_id = tokenizer("<|#image#|>", add_special_tokens=False)["input_ids"][-1]
    endofmedia_token_id = tokenizer("<|#endofimage#|>", add_special_tokens=False)["input_ids"][-1]
    pad_token_id = tokenizer(tokenizer.pad_token, add_special_tokens=False)["input_ids"][-1]
    bos_token_id = tokenizer(tokenizer.bos_token, add_special_tokens=False)["input_ids"][-1]

    def get_prompt(sample):
        return f"{tokenizer.bos_token}<|#image#|>{tokenizer.pad_token*vis_embed_size}<|#endofimage#|>Question: {sample['question'].strip()} Short answer:"
    model.eval().cuda()
    lang_encoder_name = model.lang_encoder.__class__.__name__.lower()
    predictions = []
    if batch_size != 1:
        tokenizer.padding_side = "left"
    if world_size > 1:
        torch.distributed.barrier()
    this_tot = 0
    for ii, batch in enumerate(more_itertools.chunked(
        tqdm(eval_dataset, desc="Running inference", disable=(rank != 0)), batch_size,
    )):
        if ii % world_size != rank:
            continue
        batch[0]["image"] = batch[0]["image"].resize((224, 224))
        batch_images = prepare_batch_images(
            batch=batch,
            image_processor=image_processor,
        ).cuda()
        batch_text = [get_prompt(s) for s in batch]
        encodings = tokenizer(
            batch_text,
            return_tensors="pt",
            padding="longest",
            truncation=True,
            max_length=2000,
        )
        input_ids = encodings["input_ids"].cuda()
        attention_mask = encodings["attention_mask"].cuda()
        image_start_index_list = ((input_ids == media_token_id).nonzero(as_tuple=True)[-1] + 1).tolist()
        image_start_index_list = [[x] for x in image_start_index_list]
        image_nums = [1] * len(input_ids)
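        # image_start_index_list marks, per sample, the position right after the
        # <|#image#|> token where the vis_embed_size visual embeddings begin.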
        with torch.inference_mode(), torch.cuda.amp.autocast(dtype=torch.float16):
            outputs = model.generate(
                batch_images,
                input_ids,
                attention_mask=attention_mask,
                max_new_tokens=10,
                min_length=1,
                num_beams=1,
                # length_penalty=0,
                image_start_index_list=image_start_index_list,
                image_nums=image_nums,
                added_bbox_list=None,
                return_dict_in_generate=True,
                output_scores=True,
            )
        scores = outputs.scores
        outputs = outputs.sequences[:, len(input_ids[0]):]
        if object_token_id in scores[0][0].sort(descending=True).indices[:5]:
            sample = batch[0]
            # print("="*80)
            # print("sample:", batch, scores[0][0].sort(descending=True).indices[:10].tolist().index(object_token_id))
            prompt1 = [f"{tokenizer.bos_token}<|#image#|>{tokenizer.pad_token*vis_embed_size}<|#endofimage#|>Question: {sample['question'].strip()} Short answer:<|#object#|><|#previsual#|>"]
            boxes, scores = get_bbox(None, batch_images, prompt1, model, tokenizer, media_token_id, prebox_token_id, return_all=True)
            # open_cv_image = np.array(sample["image"])
            # open_cv_image = open_cv_image[:, :, ::-1].copy()
            # cv2.imwrite(f"Atest_ori.png", open_cv_image)
            # open_cv_image = cv2.rectangle(open_cv_image, boxes[0][:2].astype(int), boxes[0][2:].astype(int), (0, 255, 0), 2)
            # print(scores)
            # cv2.imwrite(f"Atest.png", open_cv_image)
            if boxes is not None and len(boxes) > 0:
                prompt2 = [f"{tokenizer.bos_token}<|#image#|>{tokenizer.pad_token*vis_embed_size}<|#endofimage#|>Question: {sample['question'].strip()} Short answer: it is<|#object#|><|#previsual#|><|#prebox#|><|#object#|> a"]
                encodings = tokenizer(
                    prompt2,
                    return_tensors="pt",
                    padding="longest",
                    truncation=True,
                    max_length=2000,
                )
                input_ids = encodings["input_ids"].cuda()
                attention_mask = encodings["attention_mask"].cuda()
                image_start_index_list = ((input_ids == media_token_id).nonzero(as_tuple=True)[-1] + 1).tolist()
                image_start_index_list = [[x] for x in image_start_index_list]
                image_nums = [1] * len(input_ids)
                # normalize the predicted box to [0, 1) relative to the 224x224 input
                added_bbox_list = [torch.tensor(boxes[0] / 224.0).cuda().unsqueeze(0).clamp(0, 0.99)]
                with torch.inference_mode(), torch.cuda.amp.autocast(dtype=torch.float16):
                    outputs = model.generate(
                        batch_images,
                        input_ids,
                        attention_mask=attention_mask,
                        max_new_tokens=10,
                        min_length=1,
                        num_beams=1,
                        image_start_index_list=image_start_index_list,
                        image_nums=image_nums,
                        added_bbox_list=added_bbox_list,
                        eos_token_id=endofobject_token_id,
                    )
                outputs = outputs[:, len(input_ids[0]):]
                # print("previsual===>{}".format(tokenizer.decode(outputs[0], skip_special_tokens=True).strip().lower().strip(string.punctuation+" ")))
        # postprocess begin
        new_predictions = [
            out.strip().lower().strip(string.punctuation + " ") for out in tokenizer.batch_decode(outputs, skip_special_tokens=True)
        ]
        this_tot += 1
        predictions.extend(
            [
                {"answer": p, "question_id": sample["question_id"], "_question": sample["question"], "answers": sample["answers"]}
                for p, sample in zip(new_predictions, batch)
            ]
        )
    with open(f"{vqa_dataset}_{lang_encoder_name}_results_part{rank}_{id}.json", "w") as f:
        f.write(json.dumps(predictions))
    print("save to", f"{vqa_dataset}_{lang_encoder_name}_results_part{rank}_{id}.json")
    time.sleep(10)
    if world_size > 1:
        torch.distributed.barrier()
    if rank == 0:
        print(f"evaluate on rank {rank}. world size is {world_size}")
        predictions = []
        for rank_i in range(world_size):
            print("load", f"{vqa_dataset}_{lang_encoder_name}_results_part{rank_i}_{id}.json")
            predictions.extend(json.load(open(f"{vqa_dataset}_{lang_encoder_name}_results_part{rank_i}_{id}.json")))
            os.remove(f"{vqa_dataset}_{lang_encoder_name}_results_part{rank_i}_{id}.json")
        print("num:", len(predictions))
        # save the predictions to a temporary file
        random_uuid = str(uuid.uuid4())
        with open(f"{vqa_dataset}results_{random_uuid}.json", "w") as f:
            f.write(json.dumps(predictions, indent=4))
        acc = compute_gqa_accuracy(predictions)
        print(vqa_dataset, "score:", acc, "| save to", f"{vqa_dataset}results_{random_uuid}.json")
        os.makedirs("eval_results", exist_ok=True)
        with open(os.path.join("eval_results", f"{vqa_dataset}_{model.expr_name}_{model.step_num}_{int(time.time())}_{acc}"), "w") as f:
            f.write(json.dumps(predictions, indent=2))
        # delete the temporary file
        os.remove(f"{vqa_dataset}results_{random_uuid}.json")
    else:
        time.sleep(5)
        acc = 0.0
    if world_size > 1:
        torch.distributed.barrier()
    return acc
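
# Minimal usage sketch (hypothetical values; assumes the model, tokenizer, and
# image processor are already loaded, CUDA is available, and vis_embed_size
# matches the model's visual-embedding length):
#     acc = evaluate_gqa(model, tokenizer, image_processor,
#                        batch_size=1, vis_embed_size=64,
#                        rank=rank, world_size=world_size, id=0)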