from torch.utils.data import Dataset
import json
from PIL import Image
import os
import torch
import more_itertools
from tqdm import tqdm
import time
from vqa_metric import compute_gqa_accuracy
import string
import uuid
import numpy as np
import cv2
from open_flamingo.eval.task.utils import get_bbox


class GQADataset(Dataset):
    """GQA testdev-balanced split: yields (image, question, answer, question_id) samples."""

    def __init__(
        self,
        image_dir_path="/gpfs/u/home/LMCG/LMCGljnn/scratch/datasets/raw/gqa/images",
        annotations_path="/gpfs/u/home/LMCG/LMCGljnn/scratch/datasets/raw/gqa/testdev_balanced_questions.json",
    ):
        annotations = json.load(open(annotations_path))
        self.questions = []
        self.answers = []
        self.image_paths = []
        self.question_ids = []
        for anno_id in annotations:
            question = annotations[anno_id]["question"]
            imageId = annotations[anno_id]["imageId"]
            answer = annotations[anno_id]["answer"]
            self.questions.append(question)
            self.answers.append(answer)
            self.image_paths.append(os.path.join(image_dir_path, "{}.jpg".format(imageId)))
            self.question_ids.append(anno_id)
            # print(annotations[anno_id]["types"])
        self.vqa_dataset = "gqa"

    def __len__(self):
        return len(self.questions)

    def __getitem__(self, idx):
        question = self.questions[idx]
        question_id = self.question_ids[idx]
        answer = self.answers[idx]
        img_path = self.image_paths[idx]
        image = Image.open(img_path)
        return {
            "image": image,
            "question": question,
            "answers": answer,
            "question_id": question_id,
        }


def prepare_batch_images(batch, image_processor):
    # Stack the processed images into shape [batch, 1, 1, C, H, W]
    # (one image per sample, one frame per image).
    batch_images = None
    for b in batch:
        b_image = image_processor(b["image"]).unsqueeze(0).unsqueeze(1).unsqueeze(0)
        if batch_images is None:
            batch_images = b_image
        else:
            batch_images = torch.cat([batch_images, b_image], dim=0)
    return batch_images


def evaluate_gqa(
    model,
    tokenizer,
    image_processor,
    batch_size=1,
    vis_embed_size=None,
    rank=0,
    world_size=1,
    id=0,
):
    """
    Evaluate a model on the GQA testdev-balanced split.

    Args:
        model (nn.Module): model to evaluate
        tokenizer (transformers.PreTrainedTokenizer): tokenizer for the model
        image_processor: image processor for the model
        batch_size (int): batch size; only 1 is currently supported
        vis_embed_size (int): number of visual-embedding placeholder tokens inserted per image
        rank (int): rank of this process in distributed evaluation
        world_size (int): total number of evaluation processes
        id (int): run identifier used to name the per-rank temporary result files
    Returns:
        float: accuracy score (only meaningful on rank 0; other ranks return 0.0)
    """
    assert batch_size == 1, "evaluate_gqa currently only supports batch_size == 1"
    vqa_dataset = "gqa"
    eval_dataset = GQADataset()
    # Special-token ids used for grounding-aware prompting and decoding.
    object_token_id = tokenizer("<|#object#|>", add_special_tokens=False)["input_ids"][-1]
    endofobject_token_id = tokenizer("<|#endofobject#|>", add_special_tokens=False)["input_ids"][-1]
    prebox_token_id = tokenizer("<|#prebox#|>", add_special_tokens=False)["input_ids"][-1]
    media_token_id = tokenizer("<|#image#|>", add_special_tokens=False)["input_ids"][-1]
    endofmedia_token_id = tokenizer("<|#endofimage#|>", add_special_tokens=False)["input_ids"][-1]
    pad_token_id = tokenizer(tokenizer.pad_token, add_special_tokens=False)["input_ids"][-1]
    bos_token_id = tokenizer(tokenizer.bos_token, add_special_tokens=False)["input_ids"][-1]

    def get_prompt(sample):
        return f"{tokenizer.bos_token}<|#image#|>{tokenizer.pad_token * vis_embed_size}<|#endofimage#|>Question: {sample['question'].strip()} Short answer:"

    model.eval().cuda()
    lang_encoder_name = model.lang_encoder.__class__.__name__.lower()
    predictions = []
    if batch_size != 1:
        tokenizer.padding_side = "left"
    if world_size > 1:
        torch.distributed.barrier()
    this_tot = 0
    # Samples are sharded round-robin across ranks: rank r handles every world_size-th batch.
    for ii, batch in enumerate(more_itertools.chunked(
        tqdm(eval_dataset, desc="Running inference", disable=(rank != 0)), batch_size
    )):
        if ii % world_size != rank:
            continue
        batch[0]["image"] = batch[0]["image"].resize((224, 224))
        batch_images = prepare_batch_images(
            batch=batch,
            image_processor=image_processor,
        ).cuda()
        batch_text = [get_prompt(s) for s in batch]
        encodings = tokenizer(
            batch_text,
            return_tensors="pt",
            padding="longest",
            truncation=True,
            max_length=2000,
        )
        input_ids = encodings["input_ids"].cuda()
        attention_mask = encodings["attention_mask"].cuda()
        image_start_index_list = ((input_ids == media_token_id).nonzero(as_tuple=True)[-1] + 1).tolist()
        image_start_index_list = [[x] for x in image_start_index_list]
        image_nums = [1] * len(input_ids)
        # First pass: plain short-answer generation, keeping per-step scores so we can
        # check whether the model wants to emit the <|#object#|> grounding token.
        with torch.inference_mode(), torch.cuda.amp.autocast(dtype=torch.float16):
            outputs = model.generate(
                batch_images,
                input_ids,
                attention_mask=attention_mask,
                max_new_tokens=10,
                min_length=1,
                num_beams=1,
                # length_penalty=0,
                image_start_index_list=image_start_index_list,
                image_nums=image_nums,
                added_bbox_list=None,
                return_dict_in_generate=True,
                output_scores=True,
            )
        scores = outputs.scores
        outputs = outputs.sequences[:, len(input_ids[0]):]
        # If <|#object#|> ranks in the top-5 candidates for the first generated token,
        # run a second, grounded pass: predict a box, then condition the answer on it.
        if object_token_id in scores[0][0].sort(descending=True).indices[:5]:
            sample = batch[0]
            # print("=" * 80)
            # print("sample:", batch, scores[0][0].sort(descending=True).indices[:10].tolist().index(object_token_id))
            prompt1 = [
                f"{tokenizer.bos_token}<|#image#|>{tokenizer.pad_token * vis_embed_size}<|#endofimage#|>"
                f"Question: {sample['question'].strip()} Short answer:<|#object#|><|#previsual#|>"
            ]
            boxes, scores = get_bbox(
                None, batch_images, prompt1, model, tokenizer,
                media_token_id, prebox_token_id, return_all=True,
            )
            # Debugging: dump the predicted box onto the image.
            # open_cv_image = np.array(sample["image"])
            # open_cv_image = open_cv_image[:, :, ::-1].copy()
            # cv2.imwrite("Atest_ori.png", open_cv_image)
            # open_cv_image = cv2.rectangle(open_cv_image, boxes[0][:2].astype(int), boxes[0][2:].astype(int), (0, 255, 0), 2)
            # print(scores)
            # cv2.imwrite("Atest.png", open_cv_image)
            if boxes is not None and len(boxes) > 0:
                prompt2 = [
                    f"{tokenizer.bos_token}<|#image#|>{tokenizer.pad_token * vis_embed_size}<|#endofimage#|>"
                    f"Question: {sample['question'].strip()} Short answer: it is<|#object#|><|#previsual#|><|#prebox#|><|#object#|> a"
                ]
                encodings = tokenizer(
                    prompt2,
                    return_tensors="pt",
                    padding="longest",
                    truncation=True,
                    max_length=2000,
                )
                input_ids = encodings["input_ids"].cuda()
                attention_mask = encodings["attention_mask"].cuda()
                image_start_index_list = ((input_ids == media_token_id).nonzero(as_tuple=True)[-1] + 1).tolist()
                image_start_index_list = [[x] for x in image_start_index_list]
                image_nums = [1] * len(input_ids)
                # Normalize the top box to [0, 1) coordinates relative to the 224x224 input.
                added_bbox_list = [torch.tensor(boxes[0] / 224.0).cuda().unsqueeze(0).clamp(0, 0.99)]
                with torch.inference_mode(), torch.cuda.amp.autocast(dtype=torch.float16):
                    outputs = model.generate(
                        batch_images,
                        input_ids,
                        attention_mask=attention_mask,
                        max_new_tokens=10,
                        min_length=1,
                        num_beams=1,
                        image_start_index_list=image_start_index_list,
                        image_nums=image_nums,
                        added_bbox_list=added_bbox_list,
                        eos_token_id=endofobject_token_id,
                    )
                outputs = outputs[:, len(input_ids[0]):]
                # print("previsual===>{}".format(tokenizer.decode(outputs[0], skip_special_tokens=True).strip().lower().strip(string.punctuation + " ")))
        # Postprocess: lowercase the generated answers and strip punctuation.
        new_predictions = [
            out.strip().lower().strip(string.punctuation + " ")
            for out in tokenizer.batch_decode(outputs, skip_special_tokens=True)
        ]
        this_tot += 1
        predictions.extend(
            [
                {
                    "answer": p,
                    "question_id": sample["question_id"],
                    "_question": sample["question"],
                    "answers": sample["answers"],
                }
                for p, sample in zip(new_predictions, batch)
            ]
        )
    # Each rank writes its shard of predictions to disk; rank 0 gathers and scores them.
    with open(f"{vqa_dataset}_{lang_encoder_name}_results_part{rank}_{id}.json", "w") as f:
        f.write(json.dumps(predictions))
    print("save to", f"{vqa_dataset}_{lang_encoder_name}_results_part{rank}_{id}.json")
    time.sleep(10)
    if world_size > 1:
        torch.distributed.barrier()
    if rank == 0:
        print(f"evaluate on rank {rank}. world size is {world_size}")
        predictions = []
        for rank_i in range(world_size):
            print("load", f"{vqa_dataset}_{lang_encoder_name}_results_part{rank_i}_{id}.json")
            predictions.extend(json.load(open(f"{vqa_dataset}_{lang_encoder_name}_results_part{rank_i}_{id}.json")))
            os.remove(f"{vqa_dataset}_{lang_encoder_name}_results_part{rank_i}_{id}.json")
        print("num:", len(predictions))
        # save the predictions to a temporary file
        random_uuid = str(uuid.uuid4())
        with open(f"{vqa_dataset}results_{random_uuid}.json", "w") as f:
            f.write(json.dumps(predictions, indent=4))
        acc = compute_gqa_accuracy(predictions)
        print(vqa_dataset, "score:", acc, "| save to", f"{vqa_dataset}results_{random_uuid}.json")
        os.makedirs("eval_results", exist_ok=True)
        with open(os.path.join("eval_results", f"{vqa_dataset}_{model.expr_name}_{model.step_num}_{int(time.time())}_{acc}"), "w") as f:
            f.write(json.dumps(predictions, indent=2))
        # delete the temporary file
        os.remove(f"{vqa_dataset}results_{random_uuid}.json")
    else:
        time.sleep(5)
        acc = 0.0
    if world_size > 1:
        torch.distributed.barrier()
    return acc
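

# ---------------------------------------------------------------------------
# Hedged usage sketch (not part of the original evaluation code): a minimal,
# hypothetical driver showing how evaluate_gqa is intended to be invoked.
# Assumptions are labelled inline: `load_model_and_processors` is a placeholder
# for whatever project-specific factory builds the grounded OpenFlamingo model,
# its image processor, and a tokenizer that already contains the <|#image#|>,
# <|#object#|>, <|#previsual#|>, and <|#prebox#|> special tokens; VIS_EMBED_SIZE
# is an assumed value that must match the number of visual-embedding
# placeholders the model expects per image.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    # Hypothetical helper -- replace with the project's own checkpoint loader.
    model, image_processor, tokenizer = load_model_and_processors()  # noqa: F821 (placeholder)
    VIS_EMBED_SIZE = 64  # assumption: visual placeholder token count per image

    # Single-process evaluation; for multi-GPU runs, initialize torch.distributed
    # first and pass the real rank/world_size of each process.
    acc = evaluate_gqa(
        model=model,
        tokenizer=tokenizer,
        image_processor=image_processor,
        batch_size=1,            # the function asserts batch_size == 1
        vis_embed_size=VIS_EMBED_SIZE,
        rank=0,
        world_size=1,
        id=0,                    # suffix for the per-rank temporary result files
    )
    print("GQA accuracy:", acc)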