Spaces:

alexnasa
/

XVerse

Running on Zero

File size: 8,321 Bytes

4479f79

# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import torch
from copy import deepcopy
from collections import defaultdict
import numpy as np
import pandas as pd
import os

class MPLUG(torch.nn.Module):
    """Thin wrapper around a ModelScope visual-question-answering pipeline."""

    def __init__(self, ckpt='damo/mplug_visual-question-answering_coco_large_en', device='gpu'):
        """Create the underlying VQA pipeline.

        Args:
            ckpt: Model identifier or path, forwarded to the pipeline as ``model``.
            device: Device specifier, forwarded to the pipeline as ``device``.
        """
        super().__init__()
        # modelscope is imported here instead of at module level, so it is
        # only required once an instance is actually constructed.
        from modelscope.pipelines import pipeline as build_pipeline
        from modelscope.utils.constant import Tasks

        task = Tasks.visual_question_answering
        self.pipeline_vqa = build_pipeline(task, model=ckpt, device=device)

    def vqa(self, image, question):
        """Ask ``question`` about ``image``; return the ``'text'`` field of the pipeline output."""
        return self.pipeline_vqa({'image': image, 'question': question})['text']


class DPGScore:
    """Score an image against a set of yes/no questions using a VQA model.

    Each question is sent to the VQA model; a question scores 1.0 only when
    the returned answer is exactly ``'yes'`` and 0.0 otherwise. Two averages
    are reported: one over the raw scores, and one where a question is zeroed
    whenever one of its listed parent questions scored 0.
    """

    def __init__(self, device):
        """Load the VQA model.

        Args:
            device: Device specifier handed to ``MPLUG``.

        The checkpoint is read from the ``DPG_VQA_MODEL_PATH`` environment
        variable, with a built-in default identifier when it is unset.
        """
        self.device = device
        ckpt = os.getenv('DPG_VQA_MODEL_PATH', "xingjianleng/mplug_visual-question-answering_coco_large_en")
        self.vqa_model = MPLUG(ckpt, device=self.device)

    def __call__(self, image, q_dict):
        """Answer every question for ``image`` and aggregate the scores.

        Args:
            image: Image passed unchanged to the VQA model.
            q_dict: Mapping with the keys ``'qid2tuple'``, ``'qid2dependency'``
                (question id -> iterable of parent ids) and ``'qid2question'``
                (question id -> question text). Ids are normalised with
                ``str()`` before use.

        Returns:
            A dict echoing the three input mappings plus ``'qid2answer'``,
            ``'qid2scores'`` (raw, unfiltered scores), ``'qid2validity'`` and
            the two averages, both expressed as percentages. When there are
            no questions both averages are 0.0 instead of raising
            ``ZeroDivisionError``.
        """
        vqa_model = self.vqa_model
        qid2tuple = q_dict['qid2tuple']
        qid2dependency = q_dict['qid2dependency']
        qid2question = q_dict['qid2question']

        qid2answer = {}
        qid2scores = {}
        for qid, question in qid2question.items():
            qid = str(qid)
            answer = vqa_model.vqa(image, question)
            qid2answer[qid] = answer
            qid2scores[qid] = float(answer == 'yes')

        # Values are plain floats, so a shallow copy is a full copy.
        filtered_scores = dict(qid2scores)
        qid2validity = {}
        for qid, parent_ids in qid2dependency.items():
            qid = str(qid)
            # Parent id 0 is skipped (presumably the "no dependency" marker);
            # parents that were never scored are ignored as well.
            parent_failed = any(
                qid2scores.get(str(parent_id)) == 0
                for parent_id in parent_ids
                if int(parent_id) != 0
            )
            if parent_failed:
                filtered_scores[qid] = 0.0
            qid2validity[qid] = not parent_failed

        # Both averages share the number of asked questions as denominator.
        num_questions = len(qid2scores)
        if num_questions:
            average_score_without_dep = sum(qid2scores.values()) / num_questions
            average_score_with_dep = sum(filtered_scores.values()) / num_questions
        else:
            average_score_without_dep = 0.0
            average_score_with_dep = 0.0

        return {
            'qid2tuple': qid2tuple,
            'qid2dependency': qid2dependency,
            'qid2question': qid2question,
            'qid2answer': qid2answer,
            'qid2scores': qid2scores,
            'qid2validity': qid2validity,
            'average_score_with_dependency': average_score_with_dep * 100.,
            'average_score_without_dependency': average_score_without_dep * 100.
        }


def prepare_dpg_data(csv_path, *, skip_first_row=False):
    """Load a DPG question CSV and group its rows by ``item_id``.

    The CSV must provide the columns ``item_id``, ``proposition_id``,
    ``dependency`` (comma-separated parent ids), ``tuple`` and
    ``question_natural_language``.

    Args:
        csv_path: Path of the CSV file, read with ``pandas.read_csv``.
        skip_first_row: When true, ignore the first data row. Earlier
            versions of this function always did so, even though
            ``read_csv`` already consumes the header line; enable this only
            to reproduce numbers computed with that behaviour.

    Returns:
        A dict mapping each ``item_id`` to a dict with the keys
        ``'qid2tuple'``, ``'qid2dependency'`` and ``'qid2question'``, each
        keyed by the proposition id as a string. Dependency ids are returned
        as a list of stripped strings.
    """
    question_dict = {}
    data = pd.read_csv(csv_path)
    for i, line in data.iterrows():
        if skip_first_row and i == 0:
            continue

        item_id = line.item_id
        qid = str(line.proposition_id)
        # str() guards against pandas inferring a numeric dependency column
        # (which happens when no row lists more than one parent).
        dependency_ids = [d.strip() for d in str(line.dependency).split(',')]

        # Group by key instead of by "same id as previous row", so rows of
        # one item that are not contiguous do not overwrite each other.
        entry = question_dict.setdefault(
            item_id,
            dict(qid2tuple={}, qid2dependency={}, qid2question={}),
        )
        entry['qid2tuple'][qid] = line.tuple
        entry['qid2dependency'][qid] = dependency_ids
        entry['qid2question'][qid] = line.question_natural_language
    return question_dict



if __name__ == "__main__":
    # Command-line entry point: score every ``*.png`` under --image_dir with
    # DPGScore, optionally sharded over several ranks, then let rank 0 merge
    # the per-rank results into one JSON file and one text summary.
    import os
    import time
    import shutil
    import argparse
    from PIL import Image
    from tqdm import tqdm
    from src.train.data.data_utils import split_grid, json_load, json_dump
    from src.train.train_utils import get_train_config, get_rank_and_worldsize
    from src.train.data.validation import *
    # Imported explicitly (and after the star import) so ``glob`` does not
    # depend on what the star import happens to re-export.
    from glob import glob

    def parse_args():
        """Parse the command-line arguments of the evaluation script."""
        parser = argparse.ArgumentParser()
        parser.add_argument("--image_dir", type=str, default="")
        parser.add_argument("--csv_path", type=str, default="eval/dpg/dpg_bench.csv")
        args = parser.parse_args()
        return args

    args = parse_args()

    local_rank, global_rank, world_size = get_rank_and_worldsize()
    print(f"local_rank={local_rank}, global_rank={global_rank}, world_size={world_size}")
    is_main_process = global_rank == 0

    image_paths = sorted(glob(f"{args.image_dir}/*.png"))

    if world_size > 1:
        # Strided sharding assigns every image to exactly one rank, also when
        # the image count is not a multiple of world_size (a rounded
        # per-rank chunk size can leave trailing images unassigned).
        image_paths = image_paths[global_rank::world_size]
        # Restrict this process to a single GPU index (local_rank modulo 8).
        os.environ['CUDA_VISIBLE_DEVICES'] = str(local_rank % 8)
        print(f"[rank {global_rank}/{world_size}] has {len(image_paths)} prompts to process, using device {torch.cuda.current_device()}")

    # NOTE(review): run_name has hour resolution and is computed independently
    # on every rank; ranks started across an hour boundary would disagree on
    # temp_dir - confirm the launcher starts all ranks together.
    run_name = time.strftime("%Y%m%d-%H")
    temp_dir = os.path.join(args.image_dir, f"eval_temp_{run_name}")

    if is_main_process:
        # Start from a clean directory so stale per-rank files are not merged.
        if os.path.exists(temp_dir):
            shutil.rmtree(temp_dir)
        os.makedirs(temp_dir)

    dpg_score_model = DPGScore("cuda")
    q_dicts = prepare_dpg_data(args.csv_path)

    rank_json = {}
    with torch.no_grad():
        for image_path in tqdm(image_paths):
            # The file name without extension is the key into the question set.
            prompt_name = os.path.splitext(os.path.basename(image_path))[0]
            q_dict = q_dicts[prompt_name]
            grid_images = split_grid(Image.open(image_path))
            rank_json[prompt_name] = []
            for img in grid_images:
                result = dpg_score_model(img, q_dict)
                record = {}
                for q_id, question in result["qid2question"].items():
                    record[question] = result["qid2answer"][q_id]
                record['average_score_with_dependency'] = result['average_score_with_dependency']
                record['average_score_without_dependency'] = result['average_score_without_dependency']
                rank_json[prompt_name].append(record)

    # Write to a temporary name and rename, so rank 0 can only ever see
    # completely written ``scores_*.json`` files.
    os.makedirs(temp_dir, exist_ok=True)
    rank_save_path = os.path.join(temp_dir, f"scores_{global_rank}.json")
    partial_save_path = rank_save_path + ".tmp"
    json_dump(rank_json, partial_save_path, "utf-8")
    os.replace(partial_save_path, rank_save_path)

    if is_main_process:
        # Wait until every rank has published its result file.
        while len(glob(os.path.join(temp_dir, "scores_*.json"))) < world_size:
            time.sleep(5)
        merged_json = {}
        prompt_scores = {}
        scores = []
        for rank_path in glob(os.path.join(temp_dir, "scores_*.json")):
            rank_json = json_load(rank_path, "utf-8")
            merged_json.update(rank_json)
            for prompt_name in rank_json:
                score_list = [x['average_score_with_dependency'] for x in rank_json[prompt_name]]
                prompt_scores[prompt_name] = np.mean(score_list)
                scores += score_list

        json_dump(merged_json, os.path.join(args.image_dir, f"dpg_scores_{run_name}.json"), "utf-8")
        # The overall score is the mean over every scored sub-image.
        dpg_score = np.mean(scores)
        lines_to_write = [
            f"DPG Score: {dpg_score:.2f}\n"
        ]
        print(lines_to_write[0])
        for prompt_name, score in prompt_scores.items():
            lines_to_write.append(f"{prompt_name}: {score:.2f}\n")

        with open(os.path.join(args.image_dir, f"dpg_scores_{run_name}.txt"), "w") as f:
            f.writelines(lines_to_write)

        shutil.rmtree(temp_dir)