from .image_base import ImageBaseDataset
from .utils.judge_util import build_judge
from ..smp import *
from ..utils import track_progress_rich


class ImageMTDataset(ImageBaseDataset):

    TYPE = 'MT'

    def build_prompt(self, line):
        if isinstance(line, int):
            line = self.data.iloc[line]

        if self.meta_only:
            tgt_path = toliststr(line['image_path'])
        else:
            tgt_path = self.dump_image(line)

        questions = toliststr(line['question'])
        if 'answer' in line:
            answers = toliststr(line['answer'])
        else:
            answers = [''] * len(questions)
        assert len(questions) == len(answers)

        dlgs, pics_number = [], 0
        for i in range(len(questions)):
            q, a = questions[i], answers[i]
            # Interleave text segments and images wherever the question
            # contains `<ImageHere>` placeholders.
            if '<ImageHere>' in q:
                content = []
                tag_number = q.count('<ImageHere>')
                images = tgt_path[pics_number: pics_number + tag_number]
                pics_number += tag_number
                q_split = q.split('<ImageHere>')
                for j in range(tag_number):
                    qsp, im = q_split[j], images[j]
                    if qsp != '':
                        content.append(dict(type='text', value=qsp))
                    content.append(dict(type='image', value=im))
                if q_split[-1] != '':
                    content.append(dict(type='text', value=q_split[-1]))
            else:
                content = [dict(type='text', value=q)]
            dlgs.append(dict(role='user', content=content))

            assert '<ImageHere>' not in a, 'We currently do not support images in the answer. '
            content = [dict(type='text', value=a)]
            dlgs.append(dict(role='assistant', content=content))
        return dlgs


class MMDUDataset(ImageMTDataset):

    DATASET_URL = {'MMDU': 'https://opencompass.openxlab.space/utils/VLMEval/MMDU.tsv'}
    DATASET_MD5 = {'MMDU': '848b635a88a078f49aebcc6e39792061'}
    DIMS = [
        'Creativity', 'Richness', 'Visual Perception', 'Logical Coherence',
        'Answer Accuracy', 'Image Relationship Understanding', 'Overall Score'
    ]

    def calculat_metric(self, ans):
        # Per-dimension accumulators: `all` holds the score sum, `tot` the
        # number of judged turns, `valid` the number of parseable scores.
        all = defaultdict(lambda: 0)
        tot = defaultdict(lambda: 0)
        valid = defaultdict(lambda: 0)

        for key in ans:
            res = ans[key]['res']
            assert isinstance(res, pd.DataFrame)
            lt = len(res)
            for i in range(lt):
                line = res.iloc[i]
                for k in self.DIMS:
                    tot[k] += 1
                    if k in line and line[k] is not None:
                        try:
                            score = int(line[k])
                            score = np.clip(score, 0, 10)
                            all[k] += score
                            valid[k] += 1
                        except Exception as e:
                            print(f'Failed to parse the score: {str(e)}')

        # Report averages on a 0-100 scale, over all turns and over the
        # turns with a valid (parseable) score only.
        sp1 = {'set': 'all'}
        sp1.update({k: all[k] / tot[k] * 10 for k in self.DIMS})
        sp2 = {'set': 'valid'}
        sp2.update({k: all[k] / valid[k] * 10 for k in self.DIMS})
        return pd.DataFrame([sp1, sp2])

    def evaluate(self, eval_file, **judge_kwargs):
        suffix = eval_file.split('.')[-1]
        model = judge_kwargs.pop('model', 'gpt-4o')
        tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')
        score_file = eval_file.replace(f'.{suffix}', f'_{model}_score.csv')
        nproc = judge_kwargs.pop('nproc', 4)

        data = load(eval_file)
        judge_model = build_judge(model=model, **judge_kwargs)

        lt = len(data)
        lines = [data.iloc[i] for i in range(lt)]
        tups = [(judge_model, line) for line in lines]
        indices = [line['index'] for line in lines]

        # Resume from intermediate results if a previous run was interrupted.
        ans = {}
        if osp.exists(tmp_file):
            ans = load(tmp_file)
        tups = [x for x, i in zip(tups, indices) if i not in ans]
        indices = [i for i in indices if i not in ans]

        from .utils.mmdu import mmdu_score

        if len(indices):
            new_results = track_progress_rich(
                mmdu_score,
                tups,
                nproc=nproc,
                chunksize=nproc,
                keys=indices,
                save=tmp_file,)
            ans = load(tmp_file)
            for k, _ in zip(indices, new_results):
                assert k in ans

        metric = self.calculat_metric(ans)
        dump(metric, score_file)
        return metric
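

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of the library API). It assumes
# the package is installed and this file is executed as a module (e.g. via
# `python -m <package>.<module>`, since it uses relative imports), that
# constructing the dataset may download the MMDU TSV on first use, and that a
# judge backend is configured for `build_judge`. The prediction-file name in
# the commented-out call is hypothetical.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    # Build the interleaved multi-turn prompt for the first MMDU sample and
    # show the role / content-type structure of each dialog turn.
    dataset = MMDUDataset('MMDU')
    dialog = dataset.build_prompt(0)
    for msg in dialog:
        print(msg['role'], [item['type'] for item in msg['content']])

    # Score a model's predictions with an LLM judge (file name is hypothetical):
    # metric = dataset.evaluate('MMDU_some_model.xlsx', model='gpt-4o', nproc=4)
    # print(metric)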