# Spaces: chendl / compositional_test (Runtime error)
# File size: 11,314 Bytes
# 0b7b08a

from torch.utils.data import Dataset
import json
from PIL import Image
import os
import torch
import more_itertools
from tqdm import tqdm
import time
from vqa_metric import compute_gqa_accuracy
import string
import uuid
import numpy as np
import cv2
from open_flamingo.eval.task.utils import get_bbox

class GQADataset(Dataset):
    """GQA question/answer samples read from a questions JSON file.

    The annotations file must be a JSON object mapping each question id to a
    record with at least the ``"question"``, ``"imageId"`` and ``"answer"``
    keys. Image paths are built as ``<image_dir_path>/<imageId>.jpg``; the
    images themselves are only opened in ``__getitem__``.
    """

    def __init__(
        self,
        image_dir_path="/gpfs/u/home/LMCG/LMCGljnn/scratch/datasets/raw/gqa/images",
        annotations_path="/gpfs/u/home/LMCG/LMCGljnn/scratch/datasets/raw/gqa/testdev_balanced_questions.json",
    ):
        """Load all annotations into parallel lists.

        Args:
            image_dir_path: directory containing the ``<imageId>.jpg`` files.
            annotations_path: path to the questions JSON file.
        """
        # A context manager closes the annotation file deterministically
        # instead of leaking the handle until garbage collection.
        with open(annotations_path) as f:
            annotations = json.load(f)

        # Parallel lists: index i of each list describes the same sample.
        self.questions = []
        self.answers = []
        self.image_paths = []
        self.question_ids = []
        for anno_id, anno in annotations.items():
            self.questions.append(anno["question"])
            self.answers.append(anno["answer"])
            self.image_paths.append(
                os.path.join(image_dir_path, "{}.jpg".format(anno["imageId"]))
            )
            self.question_ids.append(anno_id)
        self.vqa_dataset = "gqa"

    def __len__(self):
        """Return the number of questions."""
        return len(self.questions)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict.

        The dict has the keys ``"image"`` (the result of ``Image.open`` on the
        sample's image path), ``"question"``, ``"answers"`` (the single answer
        stored for the question) and ``"question_id"``.
        """
        return {
            "image": Image.open(self.image_paths[idx]),
            "question": self.questions[idx],
            "answers": self.answers[idx],
            "question_id": self.question_ids[idx],
        }


def prepare_batch_images(batch, image_processor):
    """Preprocess every sample's image and concatenate the results.

    Args:
        batch: iterable of samples, each a mapping with an ``"image"`` entry.
        image_processor: callable turning one image into a ``torch.Tensor``.

    Returns:
        The processed images concatenated along dim 0 after each has gained
        three leading singleton dimensions (a ``(C, H, W)`` processor output
        yields ``(len(batch), 1, 1, C, H, W)``), or ``None`` when ``batch`` is
        empty.
    """
    # Collect first and concatenate once: calling torch.cat inside the loop
    # re-copies the accumulated tensor on every iteration.
    processed = [
        image_processor(sample["image"]).unsqueeze(0).unsqueeze(1).unsqueeze(0)
        for sample in batch
    ]
    if not processed:
        return None
    return torch.cat(processed, dim=0)



def evaluate_gqa(
    model,
    tokenizer,
    image_processor,
    batch_size=1,
    vis_embed_size=None,
    rank=0,
    world_size=1,
    id=0,
):
    """
    Evaluate a model on the GQA dataset returned by ``GQADataset()``.

    For every sample the model first generates a short answer. If the
    ``<|#object#|>`` token is among the five highest-scoring tokens of the
    first generation step, a box is requested through ``get_bbox`` and, when
    one is returned, the answer is regenerated with that box attached.
    Samples are split across ranks round-robin; every rank writes its
    predictions to a JSON part file in the current working directory, and
    rank 0 merges the parts, scores them with ``compute_gqa_accuracy`` and
    stores the merged predictions under ``eval_results/``.

    Side effects: puts ``model`` in eval mode and moves it to CUDA.

    Args:
        model (nn.Module): model to evaluate. Must expose ``generate``,
            ``lang_encoder``, ``expr_name`` and ``step_num``.
        tokenizer (transformers.PreTrainedTokenizer): tokenizer for the model
        image_processor : image processor for the model
        batch_size (int): batch size; only 1 is supported.
        vis_embed_size (int): number of pad tokens placed between the image
            start/end markers in the prompt. The ``None`` default cannot be
            multiplied with the pad token string, so an int is required.
        rank (int): rank of this process. Defaults to 0.
        world_size (int): number of processes; when > 1,
            ``torch.distributed.barrier`` is used to synchronise them.
        id: identifier embedded in the per-rank result file names.
    Returns:
        float: on rank 0 the value returned by ``compute_gqa_accuracy``;
        0.0 on every other rank.
    """
    assert batch_size == 1
    vqa_dataset = "gqa"
    # Images are resized to image_size x image_size before preprocessing, and
    # predicted boxes are divided by the same value before being fed back.
    image_size = 224
    eval_dataset = GQADataset()
    object_token_id = tokenizer("<|#object#|>", add_special_tokens=False)["input_ids"][-1]
    endofobject_token_id = tokenizer("<|#endofobject#|>", add_special_tokens=False)["input_ids"][-1]
    prebox_token_id = tokenizer("<|#prebox#|>", add_special_tokens=False)["input_ids"][-1]
    media_token_id = tokenizer("<|#image#|>", add_special_tokens=False)["input_ids"][-1]

    def get_prompt(sample, suffix=""):
        # Shared prompt prefix; `suffix` is appended right after "Short answer:".
        return f"{tokenizer.bos_token}<|#image#|>{tokenizer.pad_token*vis_embed_size}<|#endofimage#|>Question: {sample['question'].strip()} Short answer:{suffix}"

    def encode(prompts):
        # Tokenize the prompts, move them to CUDA and locate, per sequence,
        # the position right after the image token.
        encodings = tokenizer(
            prompts,
            return_tensors="pt",
            padding="longest",
            truncation=True,
            max_length=2000,
        )
        input_ids = encodings["input_ids"].cuda()
        attention_mask = encodings["attention_mask"].cuda()
        starts = ((input_ids == media_token_id).nonzero(as_tuple=True)[-1] + 1).tolist()
        image_start_index_list = [[x] for x in starts]
        image_nums = [1] * len(input_ids)
        return input_ids, attention_mask, image_start_index_list, image_nums

    model.eval().cuda()
    lang_encoder_name = model.lang_encoder.__class__.__name__.lower()

    def part_path(part_rank):
        # File holding the predictions produced by one rank.
        return f"{vqa_dataset}_{lang_encoder_name}_results_part{part_rank}_{id}.json"

    predictions = []
    # Only reachable if the assert above is stripped (python -O).
    if batch_size != 1:
        tokenizer.padding_side = "left"
    if world_size > 1:
        torch.distributed.barrier()
    for ii, batch in enumerate(more_itertools.chunked(
        tqdm(eval_dataset, desc="Running inference", disable=(rank != 0)), batch_size,
    )):
        # Round-robin sharding: each rank handles every world_size-th batch.
        if ii % world_size != rank:
            continue
        batch[0]["image"] = batch[0]["image"].resize((image_size, image_size))
        batch_images = prepare_batch_images(
            batch=batch,
            image_processor=image_processor,
        ).cuda()
        batch_text = [get_prompt(s) for s in batch]
        input_ids, attention_mask, image_start_index_list, image_nums = encode(batch_text)
        # Both context managers must be listed with a comma: `a() and b()`
        # evaluates to b() alone, so inference mode would never be entered.
        with torch.inference_mode(), torch.cuda.amp.autocast(dtype=torch.float16):
            outputs = model.generate(
                batch_images,
                input_ids,
                attention_mask=attention_mask,
                max_new_tokens=10,
                min_length=1,
                num_beams=1,
                # length_penalty=0,
                image_start_index_list=image_start_index_list,
                image_nums=image_nums,
                added_bbox_list=None,
                return_dict_in_generate=True,
                output_scores=True,
            )
        # Scores of the first generated token for the first (only) sample.
        first_step_scores = outputs.scores[0][0]
        # Keep only the newly generated tokens.
        outputs = outputs.sequences[:, len(input_ids[0]) :]
        if object_token_id in first_step_scores.sort(descending=True).indices[:5]:
            sample = batch[0]
            prompt1 = [get_prompt(sample, "<|#object#|><|#previsual#|>")]
            boxes, _ = get_bbox(None, batch_images, prompt1, model, tokenizer, media_token_id, prebox_token_id, return_all=True)
            if boxes is not None and len(boxes) > 0:
                prompt2 = [get_prompt(sample, " it is<|#object#|><|#previsual#|><|#prebox#|><|#object#|> a")]
                input_ids, attention_mask, image_start_index_list, image_nums = encode(prompt2)
                # Only the first box is used, scaled by the resize edge length.
                added_bbox_list = [torch.tensor(boxes[0]/float(image_size)).cuda().unsqueeze(0).clamp(0, 0.99)]
                with torch.inference_mode(), torch.cuda.amp.autocast(dtype=torch.float16):
                    outputs = model.generate(
                        batch_images,
                        input_ids,
                        attention_mask=attention_mask,
                        max_new_tokens=10,
                        min_length=1,
                        num_beams=1,
                        image_start_index_list=image_start_index_list,
                        image_nums=image_nums,
                        added_bbox_list=added_bbox_list,
                        eos_token_id=(endofobject_token_id),
                    )
                outputs = outputs[:, len(input_ids[0]) :]

        # postprocess begin
        new_predictions = [
            out.strip().lower().strip(string.punctuation+" ") for out in tokenizer.batch_decode(outputs, skip_special_tokens=True)
        ]
        predictions.extend(
            [
                {"answer": p, "question_id": sample["question_id"], "_question": sample["question"], "answers": sample["answers"]}
                for p, sample in zip(new_predictions, batch)
            ]
        )
    own_part = part_path(rank)
    with open(own_part, "w") as f:
        f.write(json.dumps(predictions))
    print("save to", own_part)

    # NOTE(review): presumably gives a shared filesystem time to expose the
    # part files to rank 0 before they are read - confirm.
    time.sleep(10)
    if world_size > 1:
        torch.distributed.barrier()
    if rank == 0:
        print(f"evaluate on rank {rank}. world size is {world_size}")
        predictions = []
        for rank_i in range(world_size):
            path = part_path(rank_i)
            print("load", path)
            # Close the part file before deleting it.
            with open(path) as f:
                predictions.extend(json.load(f))
            os.remove(path)
        print("num:", len(predictions))
        # save the predictions to a temporary file
        random_uuid = str(uuid.uuid4())
        tmp_path = f"{vqa_dataset}results_{random_uuid}.json"
        with open(tmp_path, "w") as f:
            f.write(json.dumps(predictions, indent=4))

        acc = compute_gqa_accuracy(predictions)
        print(vqa_dataset, "score:", acc, "| save to", tmp_path)
        os.makedirs("eval_results", exist_ok=True)
        with open(os.path.join("eval_results", f"{vqa_dataset}_{model.expr_name}_{model.step_num}_{int(time.time())}_{acc}"), "w") as f:
            f.write(json.dumps(predictions, indent=2))

        # delete the temporary file
        os.remove(tmp_path)
    else:
        time.sleep(5)
        acc = 0.0
    if world_size > 1:
        torch.distributed.barrier()
    return acc