"""Referring-expression generation (REG) evaluation on RefCOCOg, scored with CIDEr."""
import base64
import json
import os
import pickle
import random
import time
from io import BytesIO

import torch
from tqdm import tqdm
from PIL import Image
import numpy as np
import cv2

from coco_metric import compute_cider

# Default (cluster-specific) locations of the evaluation data; both can be
# overridden through keyword arguments of ``evaluate_reg``.
DEFAULT_REG_DATA_PKL = "/gpfs/u/home/LMCG/LMCGljnn/scratch/code/multimodal2/open_flamingo/eval/task/others/refcocog_reg_val_data.pkl"
DEFAULT_REG_ANNOTATIONS_JSON = "/gpfs/u/home/LMCG/LMCGljnn/scratch/code/multimodal2/open_flamingo/eval/task/others/refcocog_reg_val_label.json"


def _load_reg_samples(pkl_file):
    """Flatten the pickled REG data into one record per (image, region).

    Each record is ``[uniq_id, image_id, reference_texts, region, image]``
    where ``uniq_id`` is the first unique id listed for the region and
    ``reference_texts`` holds the text of every unique id of that region.
    """
    # NOTE(security): pickle.load can execute arbitrary code. Only point
    # ``pkl_file`` at files that come from a trusted source.
    with open(pkl_file, "rb") as f:
        dataset = pickle.load(f)

    data = dataset["data"]
    uniq_id_to_text = dataset["uniq_id_to_text"]
    uniq_id_to_image = dataset["uniq_id_to_image"]
    uniq_id_to_image_id = dataset["uniq_id_to_image_id"]

    samples = []
    for image_key in data:
        regions = data[image_key]
        for region in regions:
            uniq_ids = regions[region]
            first_id = uniq_ids[0]
            samples.append(
                [
                    first_id,
                    uniq_id_to_image_id[first_id],
                    [uniq_id_to_text[r] for r in uniq_ids],
                    region,
                    uniq_id_to_image[first_id],
                ]
            )
    return samples


def evaluate_reg(
    model,
    tokenizer,
    image_processor,
    vis_embed_size=None,
    rank=0,
    world_size=1,
    id=0,
    *,
    pkl_file=DEFAULT_REG_DATA_PKL,
    annotations_path=DEFAULT_REG_ANNOTATIONS_JSON,
):
    """Generate a caption for every annotated region and score them with CIDEr.

    Samples are sharded round-robin over ranks (``index % world_size == rank``).
    Every rank writes its predictions to ``reg_<lang_encoder>_<rank>_<id>.json``
    in the current directory; rank 0 then merges all shards, calls
    ``compute_cider`` and stores the merged predictions under ``eval_results/``.

    Args:
        model: Model exposing ``lang_encoder``, ``generate``, ``expr_name`` and
            ``step_num``. It is switched to eval mode and moved to CUDA.
        tokenizer: Tokenizer used to build the prompt and decode the output.
        image_processor: Callable turning a resized image into a tensor.
        vis_embed_size: Number of pad tokens inserted between the image
            start/end markers in the prompt. Required despite the ``None``
            default.
        rank: Rank of the current process.
        world_size: Total number of processes; a ``torch.distributed`` barrier
            is used when it is greater than 1.
        id: Run identifier embedded in the temporary file names (the name
            shadows the builtin but is kept for caller compatibility).
        pkl_file: Path of the pickled evaluation data.
        annotations_path: Path of the ground-truth annotation JSON handed to
            ``compute_cider``.

    Returns:
        The ``"CIDEr"`` entry of the computed metrics on rank 0, ``None`` on
        every other rank.

    Raises:
        TypeError: If ``vis_embed_size`` is ``None``.
    """
    if vis_embed_size is None:
        # Previously surfaced as an opaque ``str * None`` error inside the loop.
        raise TypeError("vis_embed_size must be an int, got None")

    lang_encoder_name = model.lang_encoder.__class__.__name__.lower()
    # Only the image-token id is needed below; let a failure surface here
    # instead of swallowing it and hitting a NameError later.
    media_token_id = tokenizer("<|#image#|>", add_special_tokens=False)["input_ids"][-1]

    model.eval().cuda()
    if world_size > 1:
        torch.distributed.barrier()

    lines = _load_reg_samples(pkl_file)
    print("total data:", len(lines))

    # The prompt is identical for every sample, so build, tokenize and move it
    # to the GPU once instead of once per iteration.
    prompt = [f"{tokenizer.bos_token}<|#image#|>{tokenizer.pad_token*vis_embed_size}<|#endofimage#|><|#object#|><|#previsual#|><|#prebox#|><|#object#|>"]
    encodings = tokenizer(
        prompt,
        padding="longest",
        truncation=True,
        return_tensors="pt",
        max_length=2000,
    )
    input_ids = encodings["input_ids"]
    attention_mask = encodings["attention_mask"]
    # Position right after each image token.
    image_start_index_list = [
        [x] for x in ((input_ids == media_token_id).nonzero(as_tuple=True)[-1] + 1).tolist()
    ]
    image_nums = [1] * len(input_ids)
    prompt_length = len(input_ids[0])
    input_ids = input_ids.cuda()
    attention_mask = attention_mask.cuda()

    this_tot = 0
    predictions = []
    pbar = tqdm(lines, disable=(rank != 0))
    for index, (_, image_id, text, region_coord, image) in enumerate(pbar):
        if index % world_size != rank:
            continue

        # Rescale the box from original pixel coordinates to the 224x224 input.
        width = image.width
        height = image.height
        gt_box = np.array(region_coord) / np.array([width, height, width, height]) * 224
        image = image.resize((224, 224))
        batch_images = image_processor(image).unsqueeze(0).unsqueeze(1).unsqueeze(0).cuda()
        # Box normalized by the input size and clamped to [0, 0.99].
        added_bbox_list = [(torch.tensor(gt_box).cuda() / 224).clamp(0, 0.99).unsqueeze(0)]

        # Use the comma form: ``a() and b()`` evaluates to ``b()`` only, so
        # inference mode would never actually be entered.
        with torch.inference_mode(), torch.cuda.amp.autocast(dtype=torch.float16):
            outputs = model.generate(
                batch_images,
                input_ids,
                attention_mask=attention_mask,
                max_new_tokens=25,
                min_length=5,
                num_beams=8,
                length_penalty=0,
                image_start_index_list=image_start_index_list,
                image_nums=image_nums,
                added_bbox_list=added_bbox_list,
            )
        # Drop the prompt tokens and keep only the generated continuation.
        outputs = outputs[:, prompt_length:]
        new_prediction = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0].strip().lower()

        this_tot += 1
        if rank == 0 and this_tot % 10 == 0:
            tqdm.write(f"answer: {text}\nmodel output: {new_prediction}")
        predictions.append({"image_id": image_id, "caption": new_prediction})

    # Every rank dumps its shard; rank 0 merges them after the barrier.
    results_path = f"reg_{lang_encoder_name}_{rank}_{id}.json"
    with open(results_path, "w") as f:
        json.dump(predictions, f)
    print("save to", results_path)
    time.sleep(5)
    if world_size > 1:
        torch.distributed.barrier()
    if rank != 0:
        return None

    print(f"evaluate on rank {rank}. world size is {world_size}")
    predictions = []
    for rank_i in range(world_size):
        part_results_path = f"reg_{lang_encoder_name}_{rank_i}_{id}.json"
        print("load", part_results_path)
        with open(part_results_path) as f:
            predictions.extend(json.load(f))
        os.remove(part_results_path)
    print("num:", len(predictions))

    results_path = f"reg_{lang_encoder_name}_{id}_result.json"
    with open(results_path, "w") as f:
        json.dump(predictions, f, indent=2)
    metrics = compute_cider(
        result_path=results_path,
        annotations_path=annotations_path,
    )
    os.makedirs("eval_results", exist_ok=True)
    cider = metrics["CIDEr"]
    print("cider", cider)
    with open(os.path.join("eval_results", f"reg_{model.expr_name}_{model.step_num}_{int(time.time())}_{cider}"), "w") as f:
        f.write(json.dumps(predictions, indent=2))
    # delete the temporary file
    os.remove(results_path)
    return cider


if __name__ == "__main__":
    # Debugging aid: load the COCO Karpathy ground truth and drop into pdb.
    with open("/gpfs/u/home/LMCG/LMCGljnn/scratch/.cache/lavis/coco_gt/coco_karpathy_test_gt.json") as f:
        anno = json.load(f)
    import pdb
    pdb.set_trace()
    print(anno.keys())