BK-Lee committed
Commit 6957169 · 1 Parent(s): 6f9e86a
app.py ADDED
@@ -0,0 +1,108 @@
+ import torch
+ import gradio as gr
+ from config import *
+ from PIL import Image
+ from utils.utils import *
+ from threading import Thread
+ import torch.nn.functional as F
+ from meteor.load_mmamba import load_mmamba
+ from meteor.load_meteor import load_meteor
+ from transformers import TextIteratorStreamer
+ from torchvision.transforms.functional import pil_to_tensor
+
+ # A100 Zero GPU
+ import spaces
+
+ # loading meteor model
+ mmamba = load_mmamba('BK-Lee/Meteor-Mamba').cuda()
+ meteor, tok_meteor = load_meteor('BK-Lee/Meteor-MLM', bits=4)
+
+ # device
+ device = torch.cuda.current_device()
+
+ # freeze model
+ freeze_model(mmamba)
+ freeze_model(meteor)
+
+ # previous length
+ previous_length = 0
+
+
+ def threading_function(inputs, image_token_number, streamer):
+     # Meteor Mamba
+     mmamba_inputs = mmamba.eval_process(inputs=inputs, tokenizer=tok_meteor, device=device, img_token_number=image_token_number)
+     if 'image' in mmamba_inputs.keys():
+         clip_features = meteor.clip_features(mmamba_inputs['image'])
+         mmamba_inputs.update({"image_features": clip_features})
+     mmamba_outputs = mmamba(**mmamba_inputs)
+
+     # Meteor
+     meteor_inputs = meteor.eval_process(inputs=inputs, data='demo', tokenizer=tok_meteor, device=device, img_token_number=image_token_number)
+     if 'image' in mmamba_inputs.keys():
+         meteor_inputs.update({"image_features": clip_features})
+     meteor_inputs.update({"tor_features": mmamba_outputs.tor_features})
+
+     # generation kwargs
+     generation_kwargs = meteor_inputs
+     generation_kwargs.update({'streamer': streamer})
+     generation_kwargs.update({'do_sample': True})
+     generation_kwargs.update({'max_new_tokens': 128})
+     generation_kwargs.update({'top_p': 0.95})
+     generation_kwargs.update({'temperature': 0.9})
+     generation_kwargs.update({'use_cache': True})
+     return meteor.generate(**generation_kwargs)
+
+ def add_message(history, message):
+     for x in message["files"]:
+         history.append(((x,), None))
+     if message["text"] is not None:
+         history.append((message["text"], None))
+     return history, gr.MultimodalTextbox(value=None, interactive=False)
+
+ @spaces.GPU(duration=120)
+ def bot_streaming(message, history):
+
+     # prompt type -> input prompt
+     image_token_number = int((490/14)**2)
+     if len(message['files']) != 0:
+         # Image Load
+         image = F.interpolate(pil_to_tensor(Image.open(message['files'][0]).convert("RGB")).unsqueeze(0), size=(490, 490), mode='bicubic').squeeze(0)
+         inputs = [{'image': image, 'question': message['text']}]
+     else:
+         inputs = [{'question': message['text']}]
+
+     # Meteor Generation
+     with torch.inference_mode():
+         # streamer kwargs
+         streamer = TextIteratorStreamer(tok_meteor, skip_special_tokens=True)
+
+         # Threading generation
+         thread = Thread(target=threading_function, kwargs=dict(inputs=inputs, image_token_number=image_token_number, streamer=streamer))
+         thread.start()
+
+         # collect generated text from the streamer
+         generated_text = ""
+         for new_text in streamer:
+             generated_text += new_text
+
+     # Text decoding
+     response = generated_text.split('assistant\n')[-1].split('[U')[0].strip()
+
+     buffer = ""
+     for character in response:
+         buffer += character
+         yield buffer
+
+ demo = gr.ChatInterface(fn=bot_streaming, title="Meteor",
+                         description="Meteor",
+                         stop_btn="Stop Generation", multimodal=True)
+ demo.launch(debug=True)
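For reference, a minimal sketch of exercising the handler above outside the Gradio UI. Assumptions: the checkpoints above are already loaded, "sample.jpg" is a placeholder image path, and the message dict mirrors what gr.MultimodalTextbox passes to bot_streaming; this is illustrative, not part of the commit.

# hypothetical smoke test for bot_streaming (placeholder path, not from this repo)
message = {"text": "Describe this image.", "files": ["sample.jpg"]}
for partial in bot_streaming(message, history=[]):
    print(partial)  # prints the progressively longer response buffer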
config.py ADDED
@@ -0,0 +1,57 @@
+ # OpenAI Key
+ OPENAI_KEY = ""
+
+ # Dataset root
+ DATASET_ROOT = ""
+
+ # Pre Meteor Dataset
+ METEOR_DATASET = "Meteor.json"
+
+ # Various json and parquet files
+ SHAREGPT4V_CAPTION = "sharegpt4v_instruct_gpt4-vision_cap100k.json"
+ SHAREGPT4V_INSTRUCTION = "sharegpt4v_mix665k_cap23k_coco-ap9k_lcs3k_sam9k_div2k.json"
+ MINIGEMINI_INSTRUCTION = "minigemini_instruction.json"
+ DOCDOWNSTREAM = 'train.jsonl'
+ DOCREASON = 'detailed_explanation.jsonl'
+ GLLAVA_ALIGN = "gllava_align.parquet"
+ GLLAVA_QA = "gllava_qa.parquet"
+ MATHVISION = "mathvision.parquet"
+ MATHINSTRUCT = "MathInstruct.json"
+ MATHPLUS = "mathplus.parquet"
+
+ # Json files for Evaluation
+ VQAV2 = "VQAv2/v2_OpenEnded_mscoco_test2015_questions.json"
+ GQA = "gqa/testdev_balanced_questions.json"
+ SQA = "ScienceQA/problems.json"
+ SQA_SPLIT = "ScienceQA/pid_splits.json"
+ VIZWIZ = "VizWiz/test.json"
+ TEXTVQA = "TextVQA/llava_textvqa_val_v051_ocr.json"
+ TEXTVQA_ANNOTATIONS = "TextVQA/TextVQA_0.5.1_val.json"
+ POPE_POPULAR = "POPE/coco_pope_popular.json"
+ POPE_ADVERSARIAL = "POPE/coco_pope_adversarial.json"
+ POPE_RANDOM = "POPE/coco_pope_random.json"
+ MME = "MME_Benchmark_release_version/llava_mme.json"
+ MME_DIR = "MME_Benchmark_release_version"
+ MMBENCH = "MMBench/MMBench_TEST_EN_legacy.tsv"
+ MMBENCH_CN = "MMBench/MMBench_TEST_CN_legacy.tsv"
+ MMBENCH_DEV = "MMBench/mmbench_dev_20230712.tsv"
+ MMBENCH_CN_DEV = "MMBench/mmbench_dev_cn_20231003.tsv"
+ QBENCH = "LLVisionQA-QBench/llvisionqa_dev.json"
+ QBENCH_CN = "LLVisionQA-QBench/质衡-问答-验证集.json"
+ MMVET = "mm-vet/mm-vet.json"
+ MMMU = "MMMU/*/validation*"
+ MATHVISTA = "MathVista/testmini-00000-of-00001-725687bf7a18d64b.parquet"
+ AI2D = "ai2d/ai2d_test.json"
+ HALLUSIONBENCH = "HallusionBench/HallusionBench.json"
+ CHARTQA = "chartqa/test/test_augmented.json"
+ SEED = "SEED-Bench/SEED-Bench.json"
+ LLAVA = "llava-bench-in-the-wild/questions.jsonl"
+ # BLINK =
+ MATHVERSE = "MathVerse/testmini.json"
+ MATHVERSE_TEXT_ONLY = "MathVerse/testmini_text_only.json"
+ MMSTAR = "MMStar/mmstar.parquet"
+
+ # Available evaluation datasets
+ EVAL_DATASETS = ["qbench", "sqa", "ai2d", "chartqa", "seed", "pope", "hallusionbench", "mme", \
+                  "mathvista", "mmbench", "mmbench_cn", "mmvet", "llava", "mmstar", "mathverse"]
+
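The constants above hold paths relative to DATASET_ROOT; elsewhere in this commit (e.g. eval/create_evaluator.py below) they are resolved with os.path.join, roughly as in this short sketch:

import os
from config import DATASET_ROOT, GQA
gqa_annotations = os.path.join(DATASET_ROOT, GQA)  # "<DATASET_ROOT>/gqa/testdev_balanced_questions.json"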
eval/create_evaluator.py ADDED
@@ -0,0 +1,529 @@
1
+ import os
2
+ import re
3
+ import json
4
+ import shortuuid
5
+ import numpy as np
6
+ import pandas as pd
7
+ from config import *
8
+ from collections import defaultdict
9
+ from eval.utils import *
10
+
11
+ class BaseEvaluator:
12
+ def __init__(self):
13
+ super(BaseEvaluator, self).__init__()
14
+
15
+ # Create evaluation results folder
16
+ self.save_dir = os.path.join(DATASET_ROOT, "eval_results")
17
+ if not os.path.exists(self.save_dir):
18
+ os.makedirs(self.save_dir)
19
+
20
+ def reset(self):
21
+ # Reset results for new dataset evaluation
22
+ self.gen_answers = []
23
+ self.inputs = []
24
+
25
+ def process(self, inputs, outputs):
26
+ # Merge results
27
+ self.inputs.extend(inputs)
28
+ self.gen_answers.extend(outputs)
29
+
30
+ class Evaluator(BaseEvaluator):
31
+ def __init__(self):
32
+ """
33
+ Eval Datasets
34
+
35
+ - VQAv2
36
+ - GQA
37
+ - SQA-IMG
38
+ - VizWiz
39
+ - TextVQA
40
+ - POPE
41
+ - MME
42
+ - MMBench
43
+ - MMBench-CN
44
+ - QBench
45
+ - MM-Vet
46
+ - MMMU
47
+ - MathVista
48
+ - AI2D
49
+ - HallusionBench
50
+ - ChartQA
51
+ - SEED
52
+ - LLaVA Wild
53
+ - BLINK
54
+ - MathVerse
55
+
56
+ """
57
+
58
+ super().__init__()
59
+
60
+ def evaluate(self, model, dataset, accel):
61
+
62
+ # gather results from every GPU onto the main process
63
+ self.inputs = accel.gather_for_metrics(self.inputs)
64
+ self.gen_answers = accel.gather_for_metrics(self.gen_answers)
65
+
66
+ if accel.is_main_process:
67
+ # check for duplicates
68
+ self.inputs, self.gen_answers = remove_duplicate(dataset, self.inputs, self.gen_answers)
69
+
70
+ # Select evaluation for dataset
71
+ if dataset == "vqav2":
72
+ return self.evaluate_vqa(model, accel)
73
+ elif dataset == "gqa":
74
+ return self.evaluate_gqa(model, accel)
75
+ elif dataset == "sqa":
76
+ return self.evaluate_sqa(model, accel)
77
+ elif dataset == "vizwiz":
78
+ return self.evaluate_vizwiz(model, accel)
79
+ elif dataset == "textvqa":
80
+ return self.evaluate_textvqa(model, accel)
81
+ elif dataset == "pope":
82
+ return self.evaluate_pope(model, accel)
83
+ elif dataset == "mme":
84
+ return self.evaluate_mme(model, accel)
85
+ elif dataset == "mmbench":
86
+ return self.evaluate_mmbench(model, accel)
87
+ elif dataset == "mmbench_dev":
88
+ return self.evaluate_mmbench_dev(model, accel)
89
+ elif dataset == "mmbench_cn":
90
+ return self.evaluate_mmbench_cn(model, accel)
91
+ elif dataset == "mmbench_cn_dev":
92
+ return self.evaluate_mmbench_cn_dev(model, accel)
93
+ elif dataset == "qbench":
94
+ return self.evaluate_qbench(model, accel)
95
+ elif dataset == "mm-vet":
96
+ return self.evaluate_mmvet(model, accel)
97
+ elif dataset == "mmmu":
98
+ return self.evaluate_mmmu(model, accel)
99
+ elif dataset == "mathvista":
100
+ return self.evaluate_mathvista(model, accel)
101
+ elif dataset == "ai2d":
102
+ return self.evaluate_ai2d(model, accel)
103
+ elif dataset == "hallusionbench":
104
+ return self.evaluate_hallusionbench(model, accel)
105
+ elif dataset == "chartqa":
106
+ return self.evaluate_chartqa(model, accel)
107
+ elif dataset == "seed":
108
+ return self.evaluate_seed(model, accel)
109
+ elif dataset == "llava":
110
+ return self.evaluate_llava(model, accel)
111
+ elif dataset == "blink":
112
+ return self.evaluate_blink(model, accel)
113
+ elif dataset == "mathverse":
114
+ return self.evaluate_mathverse(model, accel)
115
+ elif dataset == "mmstar":
116
+ return self.evaluate_mmstar(model, accel)
117
+ else:
118
+ raise ValueError(
119
+ f'{dataset} is not an available dataset.')
120
+ else:
121
+ return None
122
+
123
+ def evaluate_vqa(self, model, accel):
124
+ # VQAv2 Evaluation for EvalAI server
125
+ pred_answers = [{'question_id': inputs['id'], 'answer': answer} for inputs, answer in zip(self.inputs, self.gen_answers)]
126
+ pred_pth = os.path.join(self.save_dir, f"{model}_vqav2_results.json")
127
+ json.dump(pred_answers, open(pred_pth, "w"))
128
+ accel.print(f"Finished evaluating VQAv2. Evaluate the result file saved to {pred_pth} on EvalAI server.")
129
+ return
130
+
131
+ def evaluate_gqa(self, model, accel):
132
+ # GQA Evaluation
133
+ pred_answers = {inputs['id']: answer for inputs, answer in zip(self.inputs, self.gen_answers)}
134
+ # pred_answers = [{'question_id': inputs['id'], 'answer': answer} for inputs, answer in zip(self.inputs, self.gen_answers)]
135
+ pred_pth = os.path.join(self.save_dir, f"{model}_gqa_results.json")
136
+ json.dump(pred_answers, open(pred_pth, "w"))
137
+ accel.print("GQA Results:")
138
+ results = eval_gqa(pred_answers, json.load(open(os.path.join(DATASET_ROOT, GQA))))
139
+ return results['accuracy']
140
+
141
+ def evaluate_sqa(self, model, accel):
142
+ # SQA Evaluation
143
+ pred_answers = [{'question_id': inputs['id'], 'answer': convert_to_choice(answer, inputs['candidates']), 'gt': inputs['gt']} for inputs, answer in zip(self.inputs, self.gen_answers)]
144
+ pred_pth = os.path.join(self.save_dir, f"{model}_sqa_results.json")
145
+ json.dump(pred_answers, open(pred_pth, "w"))
146
+
147
+ # Compute accuracy
148
+ results = [(answer['answer'] == answer['gt']) for answer in pred_answers]
149
+ accel.print (f"SQA Accuracy: {np.mean(results)*100} %")
150
+ return np.mean(results)*100
151
+
152
+ def evaluate_vizwiz(self, model, accel):
153
+ # VizWiz Evaluation
154
+ evaluator = EvalAIAnswerProcessor()
155
+ pred_answers = [{'image': inputs['id'], 'answer': evaluator(answer)} for inputs, answer in zip(self.inputs, self.gen_answers)]
156
+ pred_pth = os.path.join(self.save_dir, f"{model}_vizwiz_results.json")
157
+ json.dump(pred_answers, open(pred_pth, "w"))
158
+ accel.print(f"Finished evaluating VizWiz. Evaluate the result file saved to {pred_pth} on EvalAI server.")
159
+ return
160
+
161
+ def evaluate_textvqa(self, model, accel):
162
+ # TextVQA Evaluation
163
+ pred_answers = [{'question_id': inputs['id'], 'pred_answer': answer, 'question': inputs['question'], 'gt_answers': inputs['gt']} for inputs, answer in zip(self.inputs, self.gen_answers)]
164
+ pred_pth = os.path.join(self.save_dir, f"{model}_textvqa_results.json")
165
+ json.dump(pred_answers, open(pred_pth, "w"))
166
+
167
+ evaluator = TextVQAAccuracyEvaluator()
168
+ results = evaluator.eval_pred_list(pred_answers)*100
169
+ accel.print (f"TextVQA Accuracy: {results} %")
170
+ return results
171
+
172
+ def evaluate_pope(self, model, accel):
173
+ # POPE Evaluation
174
+ pred_answers = [{'question_id': inputs['id'], 'answer': answer, 'question': inputs['question'], 'category': inputs['category']} for inputs, answer in zip(self.inputs, self.gen_answers)]
175
+ pred_pth = os.path.join(self.save_dir, f"{model}_pope_results.json")
176
+ json.dump(pred_answers, open(pred_pth, "w"))
177
+
178
+ pope_results = {}
179
+ pope_results['adversarial'] = None
180
+ pope_results['popular'] = None
181
+ pope_results['random'] = None
182
+
183
+ categories = ['adversarial', 'popular', 'random']
184
+ files = [POPE_ADVERSARIAL, POPE_POPULAR, POPE_RANDOM]
185
+
186
+ for category, file in zip(categories, files):
187
+ cur_answers = [x for x in pred_answers if x['category'] == category]
188
+ cur_answers = sorted(cur_answers, key=lambda x:x["question_id"])
189
+ pope_results[category] = eval_pope(cur_answers, os.path.join(DATASET_ROOT, file))
190
+ accel.print (f"POPE Adversarial Accuracy: {pope_results['adversarial']} %")
191
+ accel.print (f"POPE Popular Accuracy: {pope_results['popular']} %")
192
+ accel.print (f"POPE Random Accuracy: {pope_results['random']} %")
193
+ return pope_results
194
+
195
+ def evaluate_mme(self, model, accel):
196
+ # MME Evaluation
197
+ pred_answers = [{'question_id': inputs['id'], 'answer': answer, "question": inputs['question'], 'category': inputs['category']} for inputs, answer in zip(self.inputs, self.gen_answers)]
198
+ pred_pth = os.path.join(self.save_dir, f"{model}_mme_results.json")
199
+ json.dump(pred_answers, open(pred_pth, "w"))
200
+
201
+ ground_truth = get_gt(data_path=os.path.join(DATASET_ROOT, MME_DIR))
202
+ result_dir = os.path.join(self.save_dir, 'mme')
203
+ os.makedirs(result_dir, exist_ok=True)
204
+ results = defaultdict(list)
205
+
206
+ for answer in pred_answers:
207
+ file = answer['question_id'].split('/')[-1].split('.')[0] + '.txt'
208
+ results[answer['category']].append((file, answer['question'], answer['answer']))
209
+
210
+ for category, cate_tups in results.items():
211
+ with open(os.path.join(result_dir, f'{category}.txt'), 'w') as fp:
212
+ questions = set() # check for duplicates
213
+ for file, prompt, answer in cate_tups:
214
+ if 'Answer the question using a single word or phrase.' in prompt:
215
+ prompt = prompt.replace('Answer the question using a single word or phrase.', '').strip()
216
+ if 'Please answer yes or no.' not in prompt:
217
+ prompt = prompt + ' Please answer yes or no.'
218
+ if (category, file, prompt) not in ground_truth:
219
+ prompt = prompt.replace(' Please answer yes or no.', '  Please answer yes or no.')  # some GT prompts use a double space before the suffix
220
+ gt_ans = ground_truth[category, file, prompt]
221
+ dup = file, prompt, gt_ans
222
+ tup = file, prompt, gt_ans, answer
223
+ if dup in questions:
224
+ continue
225
+ questions.add(dup)
226
+ fp.write('\t'.join(tup) + '\n')
227
+
228
+ evaluator = MMEEvaluator()
229
+ scores = evaluator.process_result(result_dir)
230
+ accel.print("MME Scores:")
231
+ accel.print(scores)
232
+ for eval_type, eval_scores in scores.items():
233
+ accel.print("===========", eval_type, "===========")
234
+ accel.print("total score:", eval_scores['total'], "\n")
235
+ for task_name, score in eval_scores.items():
236
+ accel.print("\t", task_name, " score:", score)
237
+ accel.print("\n")
238
+ return scores
239
+
240
+ def evaluate_mmbench(self, model, accel):
241
+ # MMBench Evaluation
242
+ df = pd.read_table(os.path.join(DATASET_ROOT, MMBENCH))
243
+ cur_df = df.copy()
244
+ cur_df = cur_df.drop(columns=['hint', 'category', 'source', 'image', 'comment', 'l2-category'])
245
+ cur_df.insert(6, 'prediction', None)
246
+ for inputs, answer in zip(self.inputs, self.gen_answers):
247
+ cur_df.loc[df['index'] == inputs['id'], 'prediction'] = answer
248
+ pred_pth = os.path.join(self.save_dir, f"{model}_mmbench_results.xlsx")
249
+ cur_df.to_excel(pred_pth, index=False, engine='openpyxl')
250
+ accel.print(f"Finished evaluating MMBench. Change {pred_pth} name to submission.xlsx and evaluate the result file saved to {pred_pth} on OpenCompass server.")
251
+ return
252
+
253
+ def evaluate_mmbench_dev(self, model, accel):
254
+ # MMBench Dev Evaluation
255
+ df = pd.read_table(os.path.join(DATASET_ROOT, MMBENCH_DEV))
256
+ cur_df = df.copy()
257
+ cur_df = cur_df.drop(columns=['hint', 'category', 'source', 'image', 'comment', 'l2-category'])
258
+ cur_df.insert(6, 'prediction', None)
259
+ for inputs, answer in zip(self.inputs, self.gen_answers):
260
+ cur_df.loc[df['index'] == inputs['id'], 'prediction'] = answer[0]
261
+ pred_pth = os.path.join(self.save_dir, f"{model}_mmbench_dev_results.xlsx")
262
+ cur_df.to_excel(pred_pth, index=False, engine='openpyxl')
263
+ accuracy = (cur_df['prediction'] == cur_df['answer']).mean()
264
+ accel.print(f'MMBench_dev Accuracy: {accuracy:.2%}')
265
+ return
266
+
267
+ def evaluate_mmbench_cn(self, model, accel):
268
+ # MMBench_CN Evaluation
269
+ df = pd.read_table(os.path.join(DATASET_ROOT, MMBENCH_CN))
270
+ cur_df = df.copy()
271
+ cur_df = cur_df.drop(columns=['hint', 'category', 'source', 'image', 'comment', 'l2-category'])
272
+ cur_df.insert(6, 'prediction', None)
273
+ for inputs, answer in zip(self.inputs, self.gen_answers):
274
+ cur_df.loc[df['index'] == inputs['id'], 'prediction'] = answer
275
+ pred_pth = os.path.join(self.save_dir, f"{model}_mmbench_cn_results.xlsx")
276
+ cur_df.to_excel(pred_pth, index=False, engine='openpyxl')
277
+ accel.print(f"Finished evaluating MMBench_CN. Change {pred_pth} name to submission.xlsx and evaluate the result file saved to {pred_pth} on OpenCompass server.")
278
+ return
279
+
280
+ def evaluate_mmbench_cn_dev(self, model, accel):
281
+ # MMBench_CN Dev Evaluation
282
+ df = pd.read_table(os.path.join(DATASET_ROOT, MMBENCH_CN_DEV))
283
+ cur_df = df.copy()
284
+ cur_df = cur_df.drop(columns=['hint', 'category', 'source', 'image', 'comment', 'l2-category'])
285
+ cur_df.insert(6, 'prediction', None)
286
+ for inputs, answer in zip(self.inputs, self.gen_answers):
287
+ cur_df.loc[df['index'] == inputs['id'], 'prediction'] = answer[0]
288
+ pred_pth = os.path.join(self.save_dir, f"{model}_mmbench_cn_dev_results.xlsx")
289
+ cur_df.to_excel(pred_pth, index=False, engine='openpyxl')
290
+ accuracy = (cur_df['prediction'] == cur_df['answer']).mean()
291
+ accel.print(f'MMBench_CN_dev Accuracy: {accuracy:.2%}')
292
+ return
293
+
294
+ def evaluate_qbench(self, model, accel):
295
+ # QBench Evaluation
296
+ pred_answers = [{'id': inputs['id'], 'answer': convert_to_choice(answer, inputs['candidates']), 'gt': inputs['gt'], 'candidates': inputs['candidates']} for inputs, answer in zip(self.inputs, self.gen_answers)]
297
+ pred_pth = os.path.join(self.save_dir, f'{model}_qbench_results.jsonl')
298
+ with open(pred_pth, "w") as pf:
299
+ pf.write(json.dumps(pred_answers) + "\n")
300
+
301
+ results = [(pred['candidates'][pred['answer']] == pred['gt']) for pred in pred_answers]
302
+ accel.print (f"QBench Accuracy: {np.mean(results)*100} %")
303
+ return np.mean(results)*100
304
+
305
+ def evaluate_mmvet(self, model, accel):
306
+ # MM-Vet Evaluation
307
+ cur_result = {f"{inputs['id']}": answer for inputs, answer in zip(self.inputs, self.gen_answers)}
308
+ pred_pth = os.path.join(self.save_dir, f'{model}_mmvet_results.json')
309
+ with open(pred_pth, 'w') as f:
310
+ json.dump(cur_result, f, indent=2)
311
+
312
+ accel.print(f"Finished evaluating MM-Vet. Evaluate the result file saved to {pred_pth}.")
313
+ return
314
+
315
+ def evaluate_mmmu(self, model, accel):
316
+ # MMMU Evaluation
317
+ predictions = {inputs['id']: answer for inputs, answer in zip(self.inputs, self.gen_answers)}
318
+ answers = {inputs['id']: {'ground_truth': inputs['gt'], 'question_type': inputs['question_type']} for inputs, answer in zip(self.inputs, self.gen_answers)}
319
+ pred_pth = os.path.join(self.save_dir, f'{model}_mmmu_results.json')
320
+ with open(pred_pth, "w") as f:
321
+ json.dump(predictions, f, indent=2)
322
+ ans_pth = os.path.join(self.save_dir, 'mmmu_answers.json')
323
+ with open(ans_pth, "w") as pf:
324
+ json.dump(answers, pf, indent=2)
325
+
326
+ # group by category
327
+ output_dict_w_cat = {}
328
+ for data_id, parsed_pred in predictions.items():
329
+ category = "_".join(data_id.split("_")[1:-1])
330
+ if category not in output_dict_w_cat:
331
+ output_dict_w_cat.update({category: {}})
332
+ output_dict_w_cat[category].update({data_id: parsed_pred})
333
+
334
+ # group by category
335
+ answer_dict_w_cat = {}
336
+ for data_id, parsed_pred in answers.items():
337
+ category = "_".join(data_id.split("_")[1:-1])
338
+ if category not in answer_dict_w_cat:
339
+ answer_dict_w_cat.update({category: {}})
340
+ answer_dict_w_cat[category].update({data_id: parsed_pred})
341
+
342
+ evaluation_result = {}
343
+
344
+ for category in CAT_SHORT2LONG.values():
345
+ accel.print("Evaluating: {}".format(category))
346
+ # get cat_outputs and cat_answers
347
+ try:
348
+ cat_outputs = output_dict_w_cat[category]
349
+ cat_answers = answer_dict_w_cat[category]
350
+ except KeyError:
351
+ accel.print("Skipping {} for not found".format(category))
352
+ continue
353
+
354
+ examples_to_eval = []
355
+ for data_id, parsed_pred in cat_outputs.items():
356
+ question_type = cat_answers[data_id]['question_type']
357
+ if question_type != 'multiple-choice':
358
+ parsed_pred = parse_open_response(parsed_pred) # mainly for type consistency (make it number, etc.)
359
+ else:
360
+ parsed_pred = parsed_pred
361
+
362
+ examples_to_eval.append({
363
+ "id": data_id,
364
+ "question_type": question_type,
365
+ "answer": cat_answers[data_id]['ground_truth'],
366
+ "parsed_pred": parsed_pred
367
+ })
368
+
369
+ judge_dict, metric_dict = evaluate(examples_to_eval)
370
+ metric_dict.update({"num_example": len(exampels_to_eval)})
371
+
372
+ evaluation_result[category] = metric_dict
373
+
374
+ printable_results = {}
375
+ # add domain Subject
376
+ for domain, in_domain_cats in DOMAIN_CAT2SUB_CAT.items():
377
+ in_domain_cat_results = {}
378
+ for cat_name in in_domain_cats: # use the order in DOMAIN_CAT2SUB_CAT
379
+ if cat_name in evaluation_result.keys():
380
+ in_domain_cat_results[cat_name] = evaluation_result[cat_name]
381
+ else:
382
+ pass
383
+ in_domain_ins_acc = calculate_ins_level_acc(in_domain_cat_results)
384
+ in_domain_data_num = sum([cat_results['num_example'] for cat_results in in_domain_cat_results.values()])
385
+ printable_results['Overall-' + domain] = {"num": int(in_domain_data_num),
386
+ "acc": round(in_domain_ins_acc, 3)
387
+ }
388
+ # add sub category
389
+ for cat_name, cat_results in in_domain_cat_results.items():
390
+ printable_results[cat_name] = {"num": int(cat_results['num_example']),
391
+ "acc": round(cat_results['acc'], 3)
392
+ }
393
+
394
+ # table.append(["-----------------------------", "-----", "----"])
395
+ all_ins_acc = calculate_ins_level_acc(evaluation_result)
396
+ printable_results['Overall'] = {"num": sum([cat_results['num_example'] for cat_results in evaluation_result.values()]),
397
+ "acc": round(all_ins_acc, 3)
398
+ }
399
+
400
+ accel.print(printable_results)
401
+ return
402
+
403
+ def evaluate_mathvista(self, model, accel):
404
+ # MathVista Evaluation
405
+ pred_answers = [{'pid': inputs['id'], 'image': inputs['id'], 'response': answer,
406
+ 'question_type': inputs['question_type'], 'answer_type': inputs['answer_type'], 'metadata': inputs['metadata'],
407
+ 'choices': inputs['choices'], 'query': inputs['question'], 'precision': inputs['precision'],} for inputs, answer in zip(self.inputs, self.gen_answers)]
408
+ predictions = {pred['pid']: pred for pred in pred_answers}
409
+ pred_pth = os.path.join(self.save_dir, f"{model}_mathvista_results.json")
410
+ json.dump(predictions, open(pred_pth, "w"))
411
+
412
+ accel.print(f"Finished evaluating MathVista. Evaluate the result file saved to {pred_pth}.")
413
+ return
414
+
415
+ def evaluate_ai2d(self, model, accel):
416
+ # AI2D Evaluation
417
+ pred_answers = [{'question_id': inputs['id'], 'answer': answer, 'gt': inputs['gt']} for inputs, answer in zip(self.inputs, self.gen_answers)]
418
+ pred_pth = os.path.join(self.save_dir, f"{model}_ai2d_results.json")
419
+ json.dump(pred_answers, open(pred_pth, "w"))
420
+
421
+ # Compute accuracy
422
+ pattern = re.compile(r'[A-Z]')
423
+ results = [(char_to_int(pattern.findall(answer)[0]) == inputs['gt']) for inputs, answer in zip(self.inputs, self.gen_answers)]
424
+
425
+ accel.print(f"AI2D Accuracy: {np.mean(results)*100} %")
426
+ return np.mean(results)*100
427
+
428
+ def evaluate_hallusionbench(self, model, accel):
429
+ # HallusionBench Evaluation
430
+ pred_answers = [{'answer': '1' if answer.lower().find('yes') != -1 else '0', 'question': inputs['question'], 'gt': inputs['gt']} for inputs, answer in zip(self.inputs, self.gen_answers)]
431
+ pred_pth = os.path.join(self.save_dir, f"{model}_hallusionbench_results.json")
432
+ json.dump(pred_answers, open(pred_pth, "w"))
433
+
434
+ # Compute accuracy
435
+ results = [(answer['answer'] == answer['gt']) for answer in pred_answers]
436
+ accel.print(f"HallusionBench Accuracy: {np.mean(results)*100} %")
437
+ return np.mean(results)*100
438
+
439
+ def evaluate_chartqa(self, model, accel):
440
+ # ChartQA Evaluation
441
+ # post processing
442
+ processed_answers = []
443
+ for x in self.gen_answers:
444
+ if any(i.isdigit() for i in x):
445
+ processed_answers.append(x.split(" ")[0])
446
+ else:
447
+ processed_answers.append(x)
448
+ pred_answers = [{'answer': answer, 'question': inputs['question'], 'annotation': inputs['gt']} for inputs, answer in zip(self.inputs, processed_answers)]
449
+ pred_pth = os.path.join(self.save_dir, f"{model}_chartqa_results.json")
450
+ json.dump(pred_answers, open(pred_pth, "w"))
451
+
452
+ # Compute accuracy
453
+ acc = evaluate_relaxed_accuracy(pred_answers)
454
+ accel.print(f"ChartQA Accuracy: {acc*100}%")
455
+ return acc
456
+
457
+ def evaluate_seed(self, model, accel):
458
+ # SEED Evaluation
459
+ pred_answers = [{'answer': answer, 'question': inputs['question'], 'question_id': inputs['id'], 'gt': inputs['gt'], 'question_type': inputs['question_type']} for inputs, answer in zip(self.inputs, self.gen_answers)]
460
+ pred_pth = os.path.join(self.save_dir, f"{model}_seed_results.json")
461
+ json.dump(pred_answers, open(pred_pth, "w"))
462
+
463
+ # Compute accuracy
464
+ results = [(answer['answer'] == answer['gt']) for answer in pred_answers]
465
+ accel.print (f"SEED Accuracy: {np.mean(results)*100} %")
466
+
467
+ # Per question type accuracy
468
+ for k, v in SEED_TYPES.items():
469
+ sub_results = []
470
+ for pred in pred_answers:
471
+ if pred['question_type'] == k:
472
+ sub_results.append(pred['answer'] == pred['gt'])
473
+ accel.print (f"{v}: {np.mean(sub_results)*100} %")
474
+
475
+ return np.mean(results)*100
476
+
477
+ def evaluate_llava(self, model, accel):
478
+ # LLaVA-in-the-Wild Evaluation
479
+ pred_answers = [{'question_id': inputs['id'], 'prompt': inputs['question'], 'text': answer, "answer_id": shortuuid.uuid()} for inputs, answer in zip(self.inputs, self.gen_answers)]
480
+ sorted_answers = sorted(pred_answers, key=lambda x: x['question_id'])
481
+ pred_pth = os.path.join(self.save_dir, f'{model}_llava_results.jsonl')
482
+ ans_file = open(pred_pth, "w")
483
+ for pred in sorted_answers:
484
+ ans_file.write(json.dumps(pred) + "\n")
485
+ ans_file.flush()
486
+ ans_file.close()
487
+
488
+ accel.print(f"Finished evaluating LLaVA-in-the-wild. Evaluate the result file saved to {pred_pth}.")
489
+ return
490
+
491
+ def evaluate_blink(self, model, accel):
492
+ # BLINK Evaluation
493
+ # TODO
494
+ return
495
+
496
+ def evaluate_mathverse(self, model, accel):
497
+ # Mathverse Evaluation
498
+ pred_answers = [{'sample_index' : inputs['id'], 'problem_index' : inputs['problem_index'], 'problem_version' : inputs['problem_version'],
499
+ 'question' : inputs['origin_question'], 'answer' : inputs['gt'],
500
+ 'question_type': inputs['question_type'],
501
+ 'metadata': inputs['metadata'], 'query_wo': inputs['question'], 'query_cot' : inputs['query_cot'], 'model_answer' : answer} for inputs, answer in zip(self.inputs, self.gen_answers)]
502
+
503
+ # answers = [item for item in pred_answers if item['problem_version'] != 'Text_Only']
504
+ # text_only_answers = [item for item in pred_answers if item['problem_version'] == 'Text_Only']
505
+
506
+ pred_pth = os.path.join(self.save_dir, f'{model}_mathverse_results.json')
507
+ json.dump(pred_answers, open(pred_pth, "w"))
508
+ pred_pth = os.path.join(self.save_dir, f'{model}_mathverse_scores.json')
509
+ eval_mathverse(self.save_dir, pred_answers,f'{model}_mathverse_extracts.json', f'{model}_mathverse_scores.json')
510
+ accel.print(f"Finished evaluating MathVerse. Evaluate the result file saved to {pred_pth}.")
511
+ # TODO
512
+ return
513
+
514
+ def evaluate_mmstar(self, model, accel):
515
+ pred_answers = [{'question': inputs['question'],
516
+ 'answer': inputs['answer'],
517
+ 'category': inputs['category'],
518
+ 'l2_category': inputs['l2_category'],
519
+ # 'bench': inputs['bench'],
520
+ 'prediction' : answer} for inputs, answer in zip(self.inputs, self.gen_answers)]
521
+
522
+ pred_pth = os.path.join(self.save_dir, f'{model}_mmstar_results.json')
523
+ json.dump(pred_answers, open(pred_pth, "w"))
524
+
525
+ df = pd.DataFrame(pred_answers)
526
+
527
+ eval_mmstar(df, self.save_dir, f'{model}_mmstar_scores.json')
528
+ pred_pth = os.path.join(self.save_dir, f'{model}_mmstar_scores.json')
529
+ accel.print(f"Finished evaluating MMStar. Evaluate the result file saved to {pred_pth}.")
eval/llavabench/eval.sh ADDED
@@ -0,0 +1,11 @@
+ python eval/llavabench/eval_gpt_review_bench.py \
+     --question /mnt/ssd/lbk-cvpr/dataset/llava-bench-in-the-wild/questions.jsonl \
+     --context /mnt/ssd/lbk-cvpr/dataset/llava-bench-in-the-wild/context.jsonl \
+     --rule eval/llavabench/rule.json \
+     --answer-list \
+         /mnt/ssd/lbk-cvpr/dataset/llava-bench-in-the-wild/answers_gpt4.jsonl \
+         /mnt/ssd/lbk-cvpr/dataset/eval_results/Meteor_llava_results.jsonl \
+     --output \
+         /mnt/ssd/lbk-cvpr/dataset/eval_results/reviews_meteor_llava_results_step3.jsonl
+
+ python eval/llavabench/summarize_gpt_review.py -f /mnt/ssd/lbk-cvpr/dataset/eval_results/reviews_meteor_llava_results_step3.jsonl
eval/llavabench/eval_gpt_review_bench.py ADDED
@@ -0,0 +1,122 @@
1
+ import argparse
2
+ import json
3
+ import os
4
+
5
+ import openai
6
+ import time
7
+
8
+ NUM_SECONDS_TO_SLEEP = 0.5
9
+
10
+ openai.api_key= ""
11
+
12
+ def get_eval(content: str, max_tokens: int):
13
+ while True:
14
+ try:
15
+ response = openai.ChatCompletion.create(
16
+ model='gpt-4-0613',
17
+ messages=[{
18
+ 'role': 'system',
19
+ 'content': 'You are a helpful and precise assistant for checking the quality of the answer.'
20
+ }, {
21
+ 'role': 'user',
22
+ 'content': content,
23
+ }],
24
+ temperature=0.2, # TODO: figure out which temperature is best for evaluation
25
+ max_tokens=max_tokens,
26
+ )
27
+ break
28
+ except openai.error.RateLimitError:
29
+ pass
30
+ except Exception as e:
31
+ print(e)
32
+ time.sleep(NUM_SECONDS_TO_SLEEP)
33
+
34
+ return response['choices'][0]['message']['content']
35
+
36
+
37
+ def parse_score(review):
38
+ try:
39
+ score_pair = review.split('\n')[0]
40
+ score_pair = score_pair.replace(',', ' ')
41
+ sp = score_pair.split(' ')
42
+ if len(sp) == 2:
43
+ return [float(sp[0]), float(sp[1])]
44
+ else:
45
+ print('error', review)
46
+ return [-1, -1]
47
+ except Exception as e:
48
+ print(e)
49
+ print('error', review)
50
+ return [-1, -1]
51
+
52
+
53
+ if __name__ == '__main__':
54
+ parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.')
55
+ parser.add_argument('-q', '--question')
56
+ parser.add_argument('-c', '--context')
57
+ parser.add_argument('-a', '--answer-list', nargs='+', default=[])
58
+ parser.add_argument('-r', '--rule')
59
+ parser.add_argument('-o', '--output')
60
+ parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output')
61
+ args = parser.parse_args()
62
+
63
+ f_q = open(os.path.expanduser(args.question))
64
+ f_ans1 = open(os.path.expanduser(args.answer_list[0]))
65
+ f_ans2 = open(os.path.expanduser(args.answer_list[1]))
66
+ rule_dict = json.load(open(os.path.expanduser(args.rule), 'r'))
67
+
68
+ if os.path.isfile(os.path.expanduser(args.output)):
69
+ cur_reviews = [json.loads(line) for line in open(os.path.expanduser(args.output))]
70
+ else:
71
+ cur_reviews = []
72
+
73
+ review_file = open(f'{args.output}', 'a')
74
+
75
+ context_list = [json.loads(line) for line in open(os.path.expanduser(args.context))]
76
+ image_to_context = {context['image']: context for context in context_list}
77
+
78
+ handles = []
79
+ idx = 0
80
+ for ques_js, ans1_js, ans2_js in zip(f_q, f_ans1, f_ans2):
81
+ ques = json.loads(ques_js)
82
+ ans1 = json.loads(ans1_js)
83
+ ans2 = json.loads(ans2_js)
84
+
85
+ inst = image_to_context[ques['image']]
86
+
87
+ if isinstance(inst['caption'], list):
88
+ cap_str = '\n'.join(inst['caption'])
89
+ else:
90
+ cap_str = inst['caption']
91
+
92
+ category = 'llava_bench_' + json.loads(ques_js)['category']
93
+ if category in rule_dict:
94
+ rule = rule_dict[category]
95
+ else:
96
+ assert False, f"Visual QA category not found in rule file: {category}."
97
+ prompt = rule['prompt']
98
+ role = rule['role']
99
+ content = (f'[Context]\n{cap_str}\n\n'
100
+ f'[Question]\n{ques["text"]}\n\n'
101
+ f'[{role} 1]\n{ans1["text"]}\n\n[End of {role} 1]\n\n'
102
+ f'[{role} 2]\n{ans2["text"]}\n\n[End of {role} 2]\n\n'
103
+ f'[System]\n{prompt}\n\n')
104
+ cur_js = {
105
+ 'id': idx+1,
106
+ 'question_id': ques['question_id'],
107
+ 'answer1_id': ans1.get('answer_id', ans1['question_id']),
108
+ 'answer2_id': ans2.get('answer_id', ans2['answer_id']),
109
+ 'category': category
110
+ }
111
+ if idx >= len(cur_reviews):
112
+ review = get_eval(content, args.max_tokens)
113
+ scores = parse_score(review)
114
+ cur_js['content'] = review
115
+ cur_js['tuple'] = scores
116
+ review_file.write(json.dumps(cur_js) + '\n')
117
+ review_file.flush()
118
+ else:
119
+ print(f'Skipping {idx} as we already have it.')
120
+ idx += 1
121
+ print(idx)
122
+ review_file.close()
eval/llavabench/rule.json ADDED
@@ -0,0 +1,11 @@
1
+ {
2
+ "coding": {"role": "Assistant", "prompt": "Your task is to evaluate the coding abilities of the above two assistants. They have been asked to implement a program to solve a given problem. Please review their code submissions, paying close attention to their problem-solving approach, code structure, readability, and the inclusion of helpful comments.\n\nPlease ensure that the assistants' submissions:\n\n1. Correctly implement the given problem statement.\n2. Contain accurate and efficient code.\n3. Include clear and concise comments that explain the code's logic and functionality.\n4. Adhere to proper coding standards and best practices.\n\nOnce you have carefully reviewed both submissions, provide detailed feedback on their strengths and weaknesses, along with any suggestions for improvement. You should first output a single line containing two scores on the scale of 1-10 (1: no code/no sense; 10: perfect) for Assistant 1 and 2, respectively. Then give extra comments starting from the next line."},
3
+ "math": {"role": "Assistant", "prompt": "We would like to request your feedback on the mathematical proficiency of two AI assistants regarding the given user question.\nFirstly, please solve the problem independently, without referring to the answers provided by Assistant 1 and Assistant 2.\nAfterward, please examine the problem-solving process of Assistant 1 and Assistant 2 step-by-step to ensure their correctness, identifying any incorrect steps if present. Your evaluation should take into account not only the answer but also the problem-solving steps.\nFinally, please output a Python tuple containing two numerical scores for Assistant 1 and Assistant 2, ranging from 1 to 10, respectively. If applicable, explain the reasons for any variations in their scores and determine which assistant performed better."},
4
+ "default": {"role": "Assistant", "prompt": "We would like to request your feedback on the performance of two AI assistants in response to the user question displayed above.\nPlease rate the helpfulness, relevance, accuracy, level of details of their responses. Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.\nPlease first output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. The two scores are separated by a space.\nIn the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment."},
5
+ "conv": {"role": "Assistant", "prompt": "We would like to request your feedback on the performance of two AI assistants in response to the user question displayed above. The user asks the question on observing an image. For your reference, the visual content in the image is represented with five descriptive sentences describing the same image and the bounding box coordinates of each object in the scene. These coordinates are in the form of bounding boxes, represented as (x1, y1, x2, y2) with floating numbers ranging from 0 to 1. These values correspond to the top left x, top left y, bottom right x, and bottom right y. \nPlease rate the helpfulness, relevance, accuracy, level of details of their responses. Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.\nPlease first output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. The two scores are separated by a space.\nIn the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment."},
6
+ "detail": {"role": "Assistant", "prompt": "We would like to request your feedback on the performance of two AI assistants in response to the user question displayed above. The user asks the question on observing an image. For your reference, the visual content in the image is represented with five descriptive sentences describing the same image and the bounding box coordinates of each object in the scene. These coordinates are in the form of bounding boxes, represented as (x1, y1, x2, y2) with floating numbers ranging from 0 to 1. These values correspond to the top left x, top left y, bottom right x, and bottom right y. \nPlease rate the helpfulness, relevance, accuracy, level of details of their responses. Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.\nPlease first output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. The two scores are separated by a space.\nIn the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment."},
7
+ "complex": {"role": "Assistant", "prompt": "We would like to request your feedback on the performance of two AI assistants in response to the user question displayed above. The user asks the question on observing an image. For your reference, the visual content in the image is represented with five descriptive sentences describing the same image and the bounding box coordinates of each object in the scene. These coordinates are in the form of bounding boxes, represented as (x1, y1, x2, y2) with floating numbers ranging from 0 to 1. These values correspond to the top left x, top left y, bottom right x, and bottom right y. \nPlease rate the helpfulness, relevance, accuracy, level of details of their responses. Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.\nPlease first output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. The two scores are separated by a space.\nIn the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment."},
8
+ "llava_bench_conv": {"role": "Assistant", "prompt": "We would like to request your feedback on the performance of two AI assistants in response to the user question displayed above. The user asks the question on observing an image. For your reference, the visual content in the image is represented with a few sentences describing the image. \nPlease rate the helpfulness, relevance, accuracy, level of details of their responses. Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.\nPlease first output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. The two scores are separated by a space.\nIn the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment."},
9
+ "llava_bench_detail": {"role": "Assistant", "prompt": "We would like to request your feedback on the performance of two AI assistants in response to the user question displayed above. The user asks the question on observing an image. For your reference, the visual content in the image is represented with a few sentences describing the image. \nPlease rate the helpfulness, relevance, accuracy, level of details of their responses. Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.\nPlease first output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. The two scores are separated by a space.\nIn the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment."},
10
+ "llava_bench_complex": {"role": "Assistant", "prompt": "We would like to request your feedback on the performance of two AI assistants in response to the user question displayed above. The user asks the question on observing an image. For your reference, the visual content in the image is represented with a few sentences describing the image. \nPlease rate the helpfulness, relevance, accuracy, level of details of their responses. Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.\nPlease first output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. The two scores are separated by a space.\nIn the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment."}
11
+ }
eval/llavabench/summarize_gpt_review.py ADDED
@@ -0,0 +1,60 @@
1
+ import json
2
+ import os
3
+ from collections import defaultdict
4
+
5
+ import numpy as np
6
+
7
+ import argparse
8
+
9
+ def parse_args():
10
+ parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.')
11
+ parser.add_argument('-d', '--dir', default=None)
12
+ parser.add_argument('-v', '--version', default=None)
13
+ parser.add_argument('-s', '--select', nargs='*', default=None)
14
+ parser.add_argument('-f', '--files', nargs='*', default=[])
15
+ parser.add_argument('-i', '--ignore', nargs='*', default=[])
16
+ return parser.parse_args()
17
+
18
+
19
+ if __name__ == '__main__':
20
+ args = parse_args()
21
+
22
+ if args.ignore is not None:
23
+ args.ignore = [int(x) for x in args.ignore]
24
+
25
+ if len(args.files) > 0:
26
+ review_files = args.files
27
+ else:
28
+ review_files = [x for x in os.listdir(args.dir) if x.endswith('.jsonl') and (x.startswith('gpt4_text') or x.startswith('reviews_') or x.startswith('review_') or 'review' in args.dir)]
29
+
30
+ for review_file in sorted(review_files):
31
+ config = os.path.basename(review_file).replace('gpt4_text_', '').replace('.jsonl', '')
32
+ if args.select is not None and any(x not in config for x in args.select):
33
+ continue
34
+ if '0613' in config:
35
+ version = '0613'
36
+ else:
37
+ version = '0314'
38
+ if args.version is not None and args.version != version:
39
+ continue
40
+ scores = defaultdict(list)
41
+ print(config)
42
+ with open(os.path.join(args.dir, review_file) if args.dir is not None else review_file) as f:
43
+ for review_str in f:
44
+ review = json.loads(review_str)
45
+ if review['question_id'] in args.ignore:
46
+ continue
47
+ if 'category' in review:
48
+ scores[review['category']].append(review['tuple'])
49
+ scores['all'].append(review['tuple'])
50
+ else:
51
+ if 'tuple' in review:
52
+ scores['all'].append(review['tuple'])
53
+ else:
54
+ scores['all'].append(review['score'])
55
+ for k, v in sorted(scores.items()):
56
+ stats = np.asarray(v).mean(0).tolist()
57
+ stats = [round(x, 3) for x in stats]
58
+ # print(k, stats, round(stats[1]/stats[0]*100, 1))
59
+ print(k, round(stats[1]/stats[0]*100, 1), round(stats[0] * 10, 1), round(stats[1] * 10, 1))
60
+ print('=================================')
eval/mathvista/calculate_score.py ADDED
@@ -0,0 +1,259 @@
1
+ import os
2
+ import re
3
+ import argparse
4
+ import pandas as pd
5
+
6
+ # !pip install python-Levenshtein
7
+ from Levenshtein import distance
8
+
9
+ import sys
10
+ sys.path.append('../')
11
+ from utilities import *
12
+
13
+
14
+ def get_most_similar(prediction, choices):
15
+ """
16
+ Use the Levenshtein distance (or edit distance) to determine which of the choices is most similar to the given prediction
17
+ """
18
+ distances = [distance(prediction, choice) for choice in choices]
19
+ ind = distances.index(min(distances))
20
+ return choices[ind]
21
+ # return min(choices, key=lambda choice: distance(prediction, choice))
22
+
23
+
24
+ def normalize_extracted_answer(extraction, choices, question_type, answer_type, precision):
25
+ """
26
+ Normalize the extracted answer to match the answer type
27
+ """
28
+ if question_type == 'multi_choice':
29
+ # make sure the extraction is a string
30
+ if isinstance(extraction, str):
31
+ extraction = extraction.strip()
32
+ else:
33
+ try:
34
+ extraction = str(extraction)
35
+ except:
36
+ extraction = ""
37
+
38
+ # extract "A" from "(A) text"
39
+ letter = re.findall(r'\(([a-zA-Z])\)', extraction)
40
+ if len(letter) > 0:
41
+ extraction = letter[0].upper()
42
+
43
+ options = [chr(ord('A') + i) for i in range(len(choices))]
44
+
45
+ if extraction in options:
46
+ # convert option letter to text, e.g. "A" -> "text"
47
+ ind = options.index(extraction)
48
+ extraction = choices[ind]
49
+ else:
50
+ # select the most similar option
51
+ extraction = get_most_similar(extraction, choices)
52
+ assert extraction in choices
53
+
54
+ elif answer_type == 'integer':
55
+ try:
56
+ extraction = str(int(float(extraction)))
57
+ except:
58
+ extraction = None
59
+
60
+ elif answer_type == 'float':
61
+ try:
62
+ extraction = str(round(float(extraction), precision))
63
+ except:
64
+ extraction = None
65
+
66
+ elif answer_type == 'list':
67
+ try:
68
+ extraction = str(extraction)
69
+ except:
70
+ extraction = None
71
+
72
+ return extraction
73
+
74
+
75
+ def safe_equal(prediction, answer):
76
+ """
77
+ Check if the prediction is equal to the answer, even if they are of different types
78
+ """
79
+ try:
80
+ if prediction == answer:
81
+ return True
82
+ return False
83
+ except Exception as e:
84
+ print(e)
85
+ return False
86
+
87
+
88
+ def get_acc_with_condition(res_pd, key, value):
89
+ if key == 'skills':
90
+ # if value in res_pd[key]:
91
+ total_pd = res_pd[res_pd[key].apply(lambda x: value in x)]
92
+ else:
93
+ total_pd = res_pd[res_pd[key] == value]
94
+
95
+ correct_pd = total_pd[total_pd['true_false'] == True]
96
+ acc = "{:.2f}".format(len(correct_pd) / len(total_pd) * 100)
97
+ return len(correct_pd), len(total_pd), acc
98
+
99
+ if __name__ == '__main__':
100
+ parser = argparse.ArgumentParser()
101
+ parser.add_argument('--output_dir', type=str, default='../results')
102
+ parser.add_argument('--output_file', type=str, default='output.json')
103
+ parser.add_argument('--score_file', type=str, default='scores.json')
104
+ parser.add_argument('--gt_file', type=str, default='../data/testmini.json', help='ground truth file')
105
+ parser.add_argument('--number', type=int, default=-1, help='number of problems to run')
106
+ parser.add_argument('--rerun', action='store_true', help='rerun the evaluation')
107
+ parser.add_argument('--caculate_gain', action='store_true', help='calculate the score gains over random guess')
108
+ parser.add_argument('--random_file', type=str, default='score_random_guess.json')
109
+ args = parser.parse_args()
110
+
111
+ # args
112
+ output_file = os.path.join(args.output_dir, args.output_file)
113
+
114
+ # # quick test
115
+ # output_file = '../results/llava-llama-2-13b/output_llava_llama_2_13b.json'
116
+
117
+ # read json
118
+ print(f"Reading {output_file}...")
119
+ results = read_json(output_file)
120
+
121
+ # read ground truth
122
+ print(f"Reading {args.gt_file}...")
123
+ gts = read_json(args.gt_file)
124
+
125
+ # full pids
126
+ full_pids = list(results.keys())
127
+ if args.number > 0:
128
+ full_pids = full_pids[:min(args.number, len(full_pids))]
129
+ print("Number of testing problems:", len(full_pids))
130
+
131
+ ## [1] Evaluate if the prediction is true or false
132
+ print("\nEvaluating the predictions...")
133
+ update_json_flag = False
134
+ for pid in full_pids:
135
+ problem = results[pid]
136
+ # print(problem)
137
+
138
+ if args.rerun:
139
+ if 'prediction' in problem:
140
+ del problem['prediction']
141
+ if 'true_false' in problem:
142
+ del problem['true_false']
143
+
144
+ choices = problem['choices']
145
+ question_type = problem['question_type']
146
+ answer_type = problem['answer_type']
147
+ precision = problem['precision']
148
+ extraction = problem['extraction']
149
+
150
+ if 'answer' in problem:
151
+ answer = problem['answer']
152
+ else:
153
+ answer = gts[pid]['answer']
154
+ problem['answer'] = answer
155
+
156
+ # normalize the extracted answer to match the answer type
157
+ prediction = normalize_extracted_answer(extraction, choices, question_type, answer_type, precision)
158
+
159
+ # verify the prediction is true or false
160
+ true_false = safe_equal(prediction, answer)
161
+
162
+ # update the problem
163
+ if "true_false" not in problem:
164
+ update_json_flag = True
165
+
166
+ elif true_false != problem['true_false']:
167
+ update_json_flag = True
168
+
169
+ if "prediction" not in problem:
170
+ update_json_flag = True
171
+
172
+ elif prediction != problem['prediction']:
173
+ update_json_flag = True
174
+
175
+ problem['prediction'] = prediction
176
+ problem['true_false'] = true_false
177
+
178
+ # save the updated json
179
+ if update_json_flag:
180
+ print("\n!!!Some problems are updated.!!!")
181
+ print(f"\nSaving {output_file}...")
182
+ save_json(results, output_file)
183
+
184
+ ## [2] Calculate the average accuracy
185
+ total = len(full_pids)
186
+ correct = 0
187
+ for pid in full_pids:
188
+ if results[pid]['true_false']:
189
+ correct += 1
190
+ accuracy = str(round(correct / total * 100, 2))
191
+ print(f"\nCorrect: {correct}, Total: {total}, Accuracy: {accuracy}%")
192
+
193
+ scores = {"average": {"accuracy": accuracy, "correct": correct, "total": total}}
194
+
195
+ ## [3] Calculate the fine-grained accuracy scores
196
+
197
+ # merge the 'metadata' attribute into the data
198
+ for pid in results:
199
+ results[pid].update(results[pid].pop('metadata'))
200
+
201
+ # convert the data to a pandas DataFrame
202
+ df = pd.DataFrame(results).T
203
+
204
+ print(len(df))
205
+ print("Number of test problems:", len(df))
206
+ # assert len(df) == 1000 # Important!!!
207
+
208
+ # assign the target keys for evaluation
209
+ target_keys = ['question_type', 'answer_type', 'language', 'source', 'category', 'task', 'context', 'grade', 'skills']
210
+
211
+ for key in target_keys:
212
+ print(f"\nType: [{key}]")
213
+ # get the unique values of the key
214
+ if key == 'skills':
215
+ # the value is a list
216
+ values = []
217
+ for i in range(len(df)):
218
+ values += df[key][i]
219
+ values = list(set(values))
220
+ else:
221
+ values = df[key].unique()
222
+ #print(values)
223
+
224
+ # calculate the accuracy for each value
225
+ scores[key] = {}
226
+ for value in values:
227
+ correct, total, acc = get_acc_with_condition(df, key, value)
228
+ if total > 0:
229
+ print(f"[{value}]: {acc}% ({correct}/{total})")
230
+ scores[key][value] = {"accuracy": acc, "correct": correct, "total": total}
231
+
232
+ # sort the scores by accuracy
233
+ scores[key] = dict(sorted(scores[key].items(), key=lambda item: float(item[1]['accuracy']), reverse=True))
234
+
235
+ # save the scores
236
+ scores_file = os.path.join(args.output_dir, args.score_file)
237
+ print(f"\nSaving {scores_file}...")
238
+ save_json(scores, scores_file)
239
+ print("\nDone!")
240
+
241
+ # [4] Calculate the score gains over random guess
242
+ if args.caculate_gain:
243
+ random_file = os.path.join(args.output_dir, args.random_file)
244
+ random_scores = json.load(open(random_file))
245
+
246
+ print("\nCalculating the score gains...")
247
+ for key in scores:
248
+ if key == 'average':
249
+ gain = round(float(scores[key]['accuracy']) - float(random_scores[key]['accuracy']), 2)
250
+ scores[key]['acc_gain'] = gain
251
+ else:
252
+ for sub_key in scores[key]:
253
+ gain = round(float(scores[key][sub_key]['accuracy']) - float(random_scores[key][sub_key]['accuracy']), 2)
254
+ scores[key][sub_key]['acc_gain'] = str(gain)
255
+
256
+ # save the score gains
257
+ print(f"\nSaving {scores_file}...")
258
+ save_json(scores, scores_file)
259
+ print("\nDone!")
eval/mathvista/eval.sh ADDED
@@ -0,0 +1,9 @@
1
+ # python eval/mathvista/extract_answer.py \
2
+ # --output_dir /mnt/ssd/lbk-cvpr/dataset/eval_results \
3
+ # --output_file Meteor_mathvista_results.json
4
+
5
+ python eval/mathvista/calculate_score.py \
6
+ --output_dir /mnt/ssd/lbk-cvpr/dataset/eval_results \
7
+ --output_file Meteor_mathvista_results_refixed.json \
8
+ --score_file Meteor_mathvista_scores.json \
9
+ --gt_file /mnt/ssd/lbk-cvpr/dataset/MathVista/annot_testmini.json
eval/mathvista/extract_answer.py ADDED
@@ -0,0 +1,150 @@
1
+ import os
2
+ import re
3
+ import time
4
+ import argparse
5
+
6
+ from tqdm import tqdm
7
+
8
+ import sys
9
+ sys.path.append('../')
10
+ from utilities import *
11
+
12
+ # OpenAI
13
+ import openai
14
+ openai.api_key = ""
15
+ # print(openai.api_key)
16
+
17
+ # load demo prompt
18
+ from prompts.ext_ans import demo_prompt
19
+
20
+
21
+ def verify_extraction(extraction):
22
+ extraction = extraction.strip()
23
+ if extraction == "" or extraction == None:
24
+ return False
25
+ return True
26
+
27
+
28
+ def create_test_prompt(demo_prompt, query, response):
29
+ demo_prompt = demo_prompt.strip()
30
+ test_prompt = f"{query}\n\n{response}"
31
+ full_prompt = f"{demo_prompt}\n\n{test_prompt}\n\nExtracted answer: "
32
+ return full_prompt
33
+
34
+
35
+ def extract_answer(response, problem, quick_extract=False):
36
+ question_type = problem['question_type']
37
+ answer_type = problem['answer_type']
38
+ choices = problem['choices']
39
+ query = problem['query']
40
+ pid = problem['pid']
41
+
42
+ if response == "":
43
+ return ""
44
+
45
+ if question_type == 'multi_choice' and response in choices:
46
+ return response
47
+
48
+ if answer_type == "integer":
49
+ try:
50
+ extraction = int(response)
51
+ return str(extraction)
52
+ except:
53
+ pass
54
+
55
+ if answer_type == "float":
56
+ try:
57
+ extraction = str(float(response))
58
+ return extraction
59
+ except:
60
+ pass
61
+
62
+ # quick extraction
63
+ if quick_extract:
64
+ print("Quickly extracting answer...")
65
+ # The answer is "text". -> "text"
66
+ try:
67
+ result = re.search(r'The answer is "(.*)"\.', response)
68
+ if result:
69
+ extraction = result.group(1)
70
+ return extraction
71
+ except:
72
+ pass
73
+
74
+ # general extraction
75
+ try:
76
+ full_prompt = create_test_prompt(demo_prompt, query, response)
77
+ extraction = get_chat_response(full_prompt, openai.api_key)
78
+ return extraction
79
+ except Exception as e:
80
+ print(e)
81
+ print(f"Error in extracting answer for {pid}")
82
+
83
+ return ""
84
+
85
+
86
+ if __name__ == '__main__':
87
+ parser = argparse.ArgumentParser()
88
+ # input
89
+ parser.add_argument('--output_dir', type=str, default='../results')
90
+ parser.add_argument('--output_file', type=str, default='answer.json')
91
+ parser.add_argument('--response_label', type=str, default='response', help='response label for the input file')
92
+ # model
93
+ parser.add_argument('--llm_engine', type=str, default='gpt-4-0613', help='llm engine',
94
+ choices = ['gpt-3.5-turbo', 'gpt-3.5', 'gpt-4', 'gpt-4-0314', 'gpt-4-0613'])
95
+ parser.add_argument('--number', type=int, default=-1, help='number of problems to run')
96
+ parser.add_argument('--quick_extract', action='store_true', help='use rules to extract answer for some problems')
97
+ parser.add_argument('--rerun', action='store_true', help='rerun the answer extraction')
98
+ # output
99
+ parser.add_argument('--save_every', type=int, default=10, help='save every n problems')
100
+ parser.add_argument('--output_label', type=str, default='', help='label for the output file')
101
+ args = parser.parse_args()
102
+
103
+ # args
104
+ label = args.response_label
105
+ result_file = os.path.join(args.output_dir, args.output_file)
106
+
107
+ if args.output_label != '':
108
+ output_file = result_file.replace('.json', f'_{args.output_label}.json')
109
+ else:
110
+ output_file = result_file
111
+
112
+ # read results
113
+ print(f"Reading {result_file}...")
114
+ results = read_json(result_file)
115
+
116
+ # full pids
117
+ full_pids = list(results.keys())
118
+ if args.number > 0:
119
+ full_pids = full_pids[:min(args.number, len(full_pids))]
120
+ print("Number of testing problems:", len(full_pids))
121
+
122
+ # test pids
123
+ if args.rerun:
124
+ test_pids = full_pids
125
+ else:
126
+ test_pids = []
127
+ for pid in full_pids:
128
+ # print(pid)
129
+ if 'extraction' not in results[pid] or not verify_extraction(results[pid]['extraction']):
130
+ test_pids.append(pid)
131
+
132
+ test_num = len(test_pids)
133
+ print("Number of problems to run:", test_num)
134
+ # print(test_pids)
135
+
136
+ # tqdm, enumerate results
137
+ for i, pid in enumerate(tqdm(test_pids)):
138
+ problem = results[pid]
139
+
140
+ assert label in problem
141
+ response = problem[label]
142
+
143
+
144
+ extraction = extract_answer(response, problem, args.quick_extract)
145
+ results[pid]['extraction'] = extraction
146
+
147
+ if i % args.save_every == 0 or i == test_num - 1:
148
+ print(f"Saving results to {output_file}...")
149
+ save_json(results, output_file)
150
+ print(f"Results saved.")
eval/mathvista/prompts/ext_ans.py ADDED
@@ -0,0 +1,42 @@
1
+
2
+
3
+ # pids = 852, 104, 824, 506, 540
4
+
5
+ demo_prompt = """
6
+ Please read the following example. Then extract the answer from the model response and type it at the end of the prompt.
7
+
8
+ Hint: Please answer the question requiring an integer answer and provide the final value, e.g., 1, 2, 3, at the end.
9
+ Question: Which number is missing?
10
+
11
+ Model response: The number missing in the sequence is 14.
12
+
13
+ Extracted answer: 14
14
+
15
+ Hint: Please answer the question requiring a floating-point number with one decimal place and provide the final value, e.g., 1.2, 1.3, 1.4, at the end.
16
+ Question: What is the fraction of females facing the camera?
17
+
18
+ Model response: The fraction of females facing the camera is 0.6, which means that six out of ten females in the group are facing the camera.
19
+
20
+ Extracted answer: 0.6
21
+
22
+ Hint: Please answer the question requiring a floating-point number with two decimal places and provide the final value, e.g., 1.23, 1.34, 1.45, at the end.
23
+ Question: How much money does Luca need to buy a sour apple candy and a butterscotch candy? (Unit: $)
24
+
25
+ Model response: Luca needs $1.45 to buy a sour apple candy and a butterscotch candy.
26
+
27
+ Extracted answer: 1.45
28
+
29
+ Hint: Please answer the question requiring a Python list as an answer and provide the final list, e.g., [1, 2, 3], [1.2, 1.3, 1.4], at the end.
30
+ Question: Between which two years does the line graph saw its maximum peak?
31
+
32
+ Model response: The line graph saw its maximum peak between 2007 and 2008.
33
+
34
+ Extracted answer: [2007, 2008]
35
+
36
+ Hint: Please answer the question and provide the correct option letter, e.g., A, B, C, D, at the end.
37
+ Question: What fraction of the shape is blue?\nChoices:\n(A) 3/11\n(B) 8/11\n(C) 6/11\n(D) 3/5
38
+
39
+ Model response: The correct answer is (B) 8/11.
40
+
41
+ Extracted answer: B
42
+ """
eval/mathvista/utilities.py ADDED
@@ -0,0 +1,199 @@
1
+ import os
2
+ import cv2
3
+ import json
4
+ import time
5
+ import pickle
6
+ import openai
7
+ import re
8
+ from word2number import w2n
9
+
10
+
11
+ def create_dir(output_dir):
12
+ if not os.path.exists(output_dir):
13
+ os.makedirs(output_dir)
14
+
15
+
16
+ def read_csv(file):
17
+ data = []
18
+ with open(file, 'r') as f:
19
+ for line in f:
20
+ data.append(line.strip())
21
+ return data
22
+
23
+
24
+ def read_pandas_csv(csv_path):
25
+ # read a pandas csv sheet
26
+ import pandas as pd
27
+ df = pd.read_csv(csv_path)
28
+ return df
29
+
30
+
31
+ def read_json(path):
32
+ with open(path, 'r', encoding='utf-8') as f:
33
+ return json.load(f)
34
+
35
+
36
+ def read_jsonl(file):
37
+ with open(file, 'r') as f:
38
+ data = [json.loads(line) for line in f]
39
+ return data
40
+
41
+
42
+ def read_pickle(path):
43
+ with open(path, 'rb') as f:
44
+ return pickle.load(f)
45
+
46
+
47
+ def save_json(data, path):
48
+ with open(path, 'w') as f:
49
+ json.dump(data, f, indent=4)
50
+
51
+
52
+ def save_array_img(path, image):
53
+ cv2.imwrite(path, image)
54
+
55
+
56
+ def contains_digit(text):
57
+ # check if text contains a digit
58
+ if any(char.isdigit() for char in text):
59
+ return True
60
+ return False
61
+
62
+ def contains_number_word(text):
63
+ # check if text contains a number word
64
+ ignore_words = ["a", "an", "point"]
65
+ words = re.findall(r'\b\w+\b', text) # This regex pattern matches any word in the text
66
+ for word in words:
67
+ if word in ignore_words:
68
+ continue
69
+ try:
70
+ w2n.word_to_num(word)
71
+ return True # If the word can be converted to a number, return True
72
+ except ValueError:
73
+ continue # If the word can't be converted to a number, continue with the next word
74
+
75
+ # check if text contains a digit
76
+ if any(char.isdigit() for char in text):
77
+ return True
78
+
79
+ return False # If none of the words could be converted to a number, return False
80
+
81
+
82
+ def contains_quantity_word(text, special_keep_words=[]):
83
+ # check if text contains a quantity word
84
+ quantity_words = ["most", "least", "fewest"
85
+ "more", "less", "fewer",
86
+ "largest", "smallest", "greatest",
87
+ "larger", "smaller", "greater",
88
+ "highest", "lowest", "higher", "lower",
89
+ "increase", "decrease",
90
+ "minimum", "maximum", "max", "min",
91
+ "mean", "average", "median",
92
+ "total", "sum", "add", "subtract",
93
+ "difference", "quotient", "gap",
94
+ "half", "double", "twice", "triple",
95
+ "square", "cube", "root",
96
+ "approximate", "approximation",
97
+ "triangle", "rectangle", "circle", "square", "cube", "sphere", "cylinder", "cone", "pyramid",
98
+ "multiply", "divide",
99
+ "percentage", "percent", "ratio", "proportion", "fraction", "rate",
100
+ ]
101
+
102
+ quantity_words += special_keep_words # dataset specific words
103
+
104
+ words = re.findall(r'\b\w+\b', text) # This regex pattern matches any word in the text
105
+ if any(word in quantity_words for word in words):
106
+ return True
107
+
108
+ return False # If none of the words could be converted to a number, return False
109
+
110
+
111
+ def is_bool_word(text):
112
+ if text in ["Yes", "No", "True", "False",
113
+ "yes", "no", "true", "false",
114
+ "YES", "NO", "TRUE", "FALSE"]:
115
+ return True
116
+ return False
117
+
118
+
119
+ def is_digit_string(text):
120
+ # remove ".0000"
121
+ text = text.strip()
122
+ text = re.sub(r'\.0+$', '', text)
123
+ try:
124
+ int(text)
125
+ return True
126
+ except ValueError:
127
+ return False
128
+
129
+
130
+ def is_float_string(text):
131
+ # text is a float string if it contains a "." and can be converted to a float
132
+ if "." in text:
133
+ try:
134
+ float(text)
135
+ return True
136
+ except ValueError:
137
+ return False
138
+ return False
139
+
140
+
141
+ def copy_image(image_path, output_image_path):
142
+ from shutil import copyfile
143
+ copyfile(image_path, output_image_path)
144
+
145
+
146
+ def copy_dir(src_dir, dst_dir):
147
+ from shutil import copytree
148
+ # copy the source directory to the target directory
149
+ copytree(src_dir, dst_dir)
150
+
151
+
152
+ import PIL.Image as Image
153
+ def get_image_size(img_path):
154
+ img = Image.open(img_path)
155
+ width, height = img.size
156
+ return width, height
157
+
158
+
159
+ def get_chat_response(promot, api_key, model="gpt-3.5-turbo", temperature=0, max_tokens=256, n=1, patience=10000000,
160
+ sleep_time=0):
161
+ messages = [
162
+ {"role": "user", "content": promot},
163
+ ]
164
+ # print("I am here")
165
+ while patience > 0:
166
+ patience -= 1
167
+ try:
168
+ response = openai.ChatCompletion.create(model=model,
169
+ messages=messages,
170
+ api_key=api_key,
171
+ temperature=temperature,
172
+ max_tokens=max_tokens,
173
+ n=n)
174
+ if n == 1:
175
+ prediction = response['choices'][0]['message']['content'].strip()
176
+ if prediction != "" and prediction != None:
177
+ return prediction
178
+ else:
179
+ prediction = [choice['message']['content'].strip() for choice in response['choices']]
180
+ if prediction[0] != "" and prediction[0] != None:
181
+ return prediction
182
+
183
+ except Exception as e:
184
+ if "Rate limit" not in str(e):
185
+ print(e)
186
+
187
+ if "Please reduce the length of the messages" in str(e):
188
+ print("!!Reduce promot size")
189
+ # reduce input prompt and keep the tail
190
+ new_size = int(len(promot) * 0.9)
191
+ new_start = len(promot) - new_size
192
+ promot = promot[new_start:]
193
+ messages = [
194
+ {"role": "user", "content": promot},
195
+ ]
196
+
197
+ if sleep_time > 0:
198
+ time.sleep(sleep_time)
199
+ return ""
eval/mm-vet/eval.sh ADDED
@@ -0,0 +1 @@
1
+ python eval/mm-vet/evaluate_mmvet.py
eval/mm-vet/evaluate_mmvet.py ADDED
@@ -0,0 +1,250 @@
1
+ import openai
2
+ import json
3
+
4
+ import os
5
+ from tqdm import tqdm
6
+ import pandas as pd
7
+ import numpy as np
8
+ from collections import Counter
9
+ import time
10
+
11
+ gpt_model = "gpt-4-0613"
12
+ openai.api_key= ""
13
+ DATASET_ROOT=""
14
+ MMVET = "mm-vet/mm-vet.json"
15
+
16
+ prompt = """Compare the ground truth and prediction from AI models, to give a correctness score for the prediction. <AND> in the ground truth means it is totally right only when all elements in the ground truth are present in the prediction, and <OR> means it is totally right when any one element in the ground truth is present in the prediction. The correctness score is 0.0 (totally wrong), 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, or 1.0 (totally right). Just complete the last space of the correctness score.
17
+
18
+ Question | Ground truth | Prediction | Correctness
19
+ --- | --- | --- | ---
20
+ What is x in the equation? | -1 <AND> -5 | x = 3 | 0.0
21
+ What is x in the equation? | -1 <AND> -5 | x = -1 | 0.5
22
+ What is x in the equation? | -1 <AND> -5 | x = -5 | 0.5
23
+ What is x in the equation? | -1 <AND> -5 | x = -5 or 5 | 0.5
24
+ What is x in the equation? | -1 <AND> -5 | x = -1 or x = -5 | 1.0
25
+ Can you explain this meme? | This meme is poking fun at the fact that the names of the countries Iceland and Greenland are misleading. Despite its name, Iceland is known for its beautiful green landscapes, while Greenland is mostly covered in ice and snow. The meme is saying that the person has trust issues because the names of these countries do not accurately represent their landscapes. | The meme talks about Iceland and Greenland. It's pointing out that despite their names, Iceland is not very icy and Greenland isn't very green. | 0.4
26
+ Can you explain this meme? | This meme is poking fun at the fact that the names of the countries Iceland and Greenland are misleading. Despite its name, Iceland is known for its beautiful green landscapes, while Greenland is mostly covered in ice and snow. The meme is saying that the person has trust issues because the names of these countries do not accurately represent their landscapes. | The meme is using humor to point out the misleading nature of Iceland's and Greenland's names. Iceland, despite its name, has lush green landscapes while Greenland is mostly covered in ice and snow. The text 'This is why I have trust issues' is a playful way to suggest that these contradictions can lead to distrust or confusion. The humor in this meme is derived from the unexpected contrast between the names of the countries and their actual physical characteristics. | 1.0
27
+ """
28
+
29
+ # load metadata
30
+ decimal_places = 1 # number of decimal places to round to
31
+
32
+
33
+ sub_set = None
34
+ sub_set_name = ''
35
+
36
+ mmvet_metadata = os.path.join(DATASET_ROOT, MMVET)
37
+ with open(mmvet_metadata, 'r') as f:
38
+ data = json.load(f)
39
+
40
+
41
+ counter = Counter()
42
+ cap_set_list = []
43
+ cap_set_counter = []
44
+ len_data = 0
45
+ for id, value in data.items():
46
+ if sub_set is not None and id not in sub_set:
47
+ continue
48
+ question = value["question"]
49
+ answer = value["answer"]
50
+ cap = value["capability"]
51
+ cap = set(cap)
52
+ counter.update(cap)
53
+ if cap not in cap_set_list:
54
+ cap_set_list.append(cap)
55
+ cap_set_counter.append(1)
56
+ else:
57
+ cap_set_counter[cap_set_list.index(cap)] += 1
58
+
59
+ len_data += 1
60
+
61
+ sorted_list = counter.most_common()
62
+ columns = [k for k, v in sorted_list]
63
+ columns.append("total")
64
+ columns.append("std")
65
+ columns.append('runs')
66
+ df = pd.DataFrame(columns=columns)
67
+
68
+
69
+ cap_set_sorted_indices = np.argsort(-np.array(cap_set_counter))
70
+ new_cap_set_list = []
71
+ new_cap_set_counter = []
72
+ for index in cap_set_sorted_indices:
73
+ new_cap_set_list.append(cap_set_list[index])
74
+ new_cap_set_counter.append(cap_set_counter[index])
75
+
76
+ cap_set_list = new_cap_set_list
77
+ cap_set_counter = new_cap_set_counter
78
+ cap_set_names = ["_".join(list(cap_set)) for cap_set in cap_set_list]
79
+
80
+ columns2 = cap_set_names
81
+ columns2.append("total")
82
+ columns2.append("std")
83
+ columns2.append('runs')
84
+ df2 = pd.DataFrame(columns=columns2)
85
+
86
+ ###### change your model name ######
87
+ model = "Meteor"
88
+ result_path = os.path.join(DATASET_ROOT, "eval_results")
89
+ num_run = 1 # we set it as 5 in the paper
90
+ model_results_file = os.path.join(result_path, f"{model}_mmvet_results.json")
91
+
92
+ # grade results for each sample to save
93
+ grade_file = f'{model}_{gpt_model}-grade-{num_run}runs.json'
94
+ grade_file = os.path.join(result_path, grade_file)
95
+
96
+ # score results regarding capabilities/capability integration to save
97
+ cap_score_file = f'{model}_{sub_set_name}{gpt_model}-cap-score-{num_run}runs.csv'
98
+ cap_score_file = os.path.join(result_path, cap_score_file)
99
+ cap_int_score_file = f'{model}_{sub_set_name}{gpt_model}-cap-int-score-{num_run}runs.csv'
100
+ cap_int_score_file = os.path.join(result_path, cap_int_score_file)
101
+
102
+ with open(model_results_file) as f:
103
+ results = json.load(f)
104
+ if os.path.exists(grade_file):
105
+ with open(grade_file, 'r') as f:
106
+ grade_results = json.load(f)
107
+ else:
108
+ grade_results = {}
109
+
110
+
111
+ def need_more_runs():
112
+ need_more_runs = False
113
+ if len(grade_results) > 0:
114
+ for k, v in grade_results.items():
115
+ if len(v['score']) < num_run:
116
+ need_more_runs = True
117
+ break
118
+ return need_more_runs or len(grade_results) < len_data
119
+
120
+
121
+ while need_more_runs():
122
+ for j in range(num_run):
123
+ print(f'eval run {j}')
124
+ for id, line in tqdm(data.items()):
125
+ if sub_set is not None and id not in sub_set:
126
+ continue
127
+ if id in grade_results and len(grade_results[id]['score']) >= (j + 1):
128
+ continue
129
+
130
+ model_pred = results[id]
131
+
132
+ question = prompt + '\n' + ' | '.join([line['question'], line['answer'].replace("<AND>", " <AND> ").replace("<OR>", " <OR> "), model_pred, ""])
133
+ messages = [
134
+ {"role": "user", "content": question},
135
+ ]
136
+
137
+ if id not in grade_results:
138
+ sample_grade = {'model': [], 'content': [], 'score': []}
139
+ else:
140
+ sample_grade = grade_results[id]
141
+
142
+
143
+ grade_sample_run_complete = False
144
+ temperature = 0.0
145
+
146
+ while not grade_sample_run_complete:
147
+ try:
148
+ response = openai.ChatCompletion.create(
149
+ model=gpt_model,
150
+ max_tokens=3,
151
+ temperature=temperature,
152
+ messages=messages)
153
+ content = response['choices'][0]['message']['content']
154
+ flag = True
155
+ try_time = 1
156
+ while flag:
157
+ try:
158
+ content = content.split(' ')[0].strip()
159
+ score = float(content)
160
+ if score > 1.0 or score < 0.0:
161
+ assert False
162
+ flag = False
163
+ except:
164
+ question = prompt + '\n' + ' | '.join([line['question'], line['answer'].replace("<AND>", " <AND> ").replace("<OR>", " <OR> "), model_pred, ""]) + "\nPredict the correctness of the answer (digit): "
165
+ messages = [
166
+ {"role": "user", "content": question},
167
+ ]
168
+ response = openai.ChatCompletion.create(
169
+ model=gpt_model,
170
+ max_tokens=3,
171
+ temperature=temperature,
172
+ messages=messages)
173
+ content = response['choices'][0]['message']['content']
174
+ try_time += 1
175
+ temperature += 0.5
176
+ print(f"{id} try {try_time} times")
177
+ print(content)
178
+ if try_time > 5:
179
+ score = 0.0
180
+ flag = False
181
+ grade_sample_run_complete = True
182
+ except:
183
+ # gpt4 may have token rate limit
184
+ print("sleep 30s")
185
+ time.sleep(30)
186
+
187
+ if len(sample_grade['model']) >= j + 1:
188
+ sample_grade['model'][j] = response['model']
189
+ sample_grade['content'][j] = content
190
+ sample_grade['score'][j] = score
191
+ else:
192
+ sample_grade['model'].append(response['model'])
193
+ sample_grade['content'].append(content)
194
+ sample_grade['score'].append(score)
195
+ grade_results[id] = sample_grade
196
+
197
+ with open(grade_file, 'w') as f:
198
+ json.dump(grade_results, f, indent=4)
199
+
200
+
201
+ assert not need_more_runs()
202
+ cap_socres = {k: [0.0]*num_run for k in columns[:-2]}
203
+ counter['total'] = len_data
204
+
205
+ cap_socres2 = {k: [0.0]*num_run for k in columns2[:-2]}
206
+ counter2 = {columns2[i]:cap_set_counter[i] for i in range(len(cap_set_counter))}
207
+ counter2['total'] = len_data
208
+
209
+ for k, v in grade_results.items():
210
+ if sub_set is not None and k not in sub_set:
211
+ continue
212
+ for i in range(num_run):
213
+ score = v['score'][i]
214
+ caps = set(data[k]['capability'])
215
+ for c in caps:
216
+ cap_socres[c][i] += score
217
+
218
+ cap_socres['total'][i] += score
219
+
220
+ index = cap_set_list.index(caps)
221
+ cap_socres2[cap_set_names[index]][i] += score
222
+ cap_socres2['total'][i] += score
223
+
224
+ for k, v in cap_socres.items():
225
+ cap_socres[k] = np.array(v) / counter[k] *100
226
+
227
+
228
+ std = round(cap_socres['total'].std(), decimal_places)
229
+ total_copy = cap_socres['total'].copy()
230
+ runs = str(list(np.round(total_copy, decimal_places)))
231
+
232
+ for k, v in cap_socres.items():
233
+ cap_socres[k] = round(v.mean(), decimal_places)
234
+
235
+ cap_socres['std'] = std
236
+ cap_socres['runs'] = runs
237
+ df.loc[model] = cap_socres
238
+
239
+
240
+ for k, v in cap_socres2.items():
241
+ cap_socres2[k] = round(np.mean(np.array(v) / counter2[k] *100), decimal_places)
242
+ cap_socres2['std'] = std
243
+ cap_socres2['runs'] = runs
244
+ df2.loc[model] = cap_socres2
245
+
246
+ df.to_csv(cap_score_file)
247
+ df2.to_csv(cap_int_score_file)
248
+
249
+ print(df)
250
+ print(df2)
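The script reads {model}_mmvet_results.json produced by the inference step; a hypothetical example of the expected shape:

# Hypothetical shape of Meteor_mmvet_results.json consumed by this script:
# {
#     "v1_0": "The answer is 14.",
#     "v1_1": "Iceland is green while Greenland is icy, which is the joke."
# }
# Keys must match the ids in mm-vet.json; values are passed to GPT as the Prediction column.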
eval/utils.py ADDED
@@ -0,0 +1,1351 @@
1
+ import os
2
+ import re
3
+ import json
4
+ import openai
5
+ from typing import Dict
6
+ from tqdm import tqdm
7
+ import random
8
+ import numpy as np
9
+ from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
10
+ from typing import Optional
11
+ from collections import defaultdict
12
+ from eval.mathvista.utilities import get_chat_response
13
+ from config import *
14
+ from copy import deepcopy
15
+
16
+ random.seed(42)
17
+
18
+ # SEED Question types
19
+ SEED_TYPES = {1: 'Scene Understanding', 2: 'Instance Identity', 3: 'Instance Location', 4: 'Instance Attributes', 5: 'Instances Counting', 6: 'Spatial Relation', 7: 'Instance Interaction', 8: 'Visual Reasoning', 9: 'Text Understanding'}
20
+
21
+ # Check for duplicated questions, items
22
+ def remove_duplicate(dataset, inputs, gen_answers):
23
+ if dataset == "mme":
24
+ return inputs, gen_answers
25
+ elif dataset == "pope":
26
+ questions = set()
27
+ new_inputs, new_answers = [], []
28
+ for i, a in zip(inputs, gen_answers):
29
+ dup = i['id'], i['category']
30
+ if dup in questions:
31
+ continue
32
+ questions.add(dup)
33
+ new_inputs.append(i)
34
+ new_answers.append(a)
35
+ else:
36
+ questions = set()
37
+ new_inputs, new_answers = [], []
38
+ for i, a in zip(inputs, gen_answers):
39
+ if i['id'] in questions:
40
+ continue
41
+ questions.add(i['id'])
42
+ new_inputs.append(i)
43
+ new_answers.append(a)
44
+ return new_inputs, new_answers
45
+
46
+ class EvalAIAnswerProcessor:
47
+ """
48
+ Processes an answer similar to Eval AI
49
+ copied from
50
+ https://github.com/facebookresearch/mmf/blob/c46b3b3391275b4181567db80943473a89ab98ab/pythia/tasks/processors.py#L897
51
+ """
52
+
53
+ CONTRACTIONS = {
54
+ "aint": "ain't",
55
+ "arent": "aren't",
56
+ "cant": "can't",
57
+ "couldve": "could've",
58
+ "couldnt": "couldn't",
59
+ "couldn'tve": "couldn't've",
60
+ "couldnt've": "couldn't've",
61
+ "didnt": "didn't",
62
+ "doesnt": "doesn't",
63
+ "dont": "don't",
64
+ "hadnt": "hadn't",
65
+ "hadnt've": "hadn't've",
66
+ "hadn'tve": "hadn't've",
67
+ "hasnt": "hasn't",
68
+ "havent": "haven't",
69
+ "hed": "he'd",
70
+ "hed've": "he'd've",
71
+ "he'dve": "he'd've",
72
+ "hes": "he's",
73
+ "howd": "how'd",
74
+ "howll": "how'll",
75
+ "hows": "how's",
76
+ "Id've": "I'd've",
77
+ "I'dve": "I'd've",
78
+ "Im": "I'm",
79
+ "Ive": "I've",
80
+ "isnt": "isn't",
81
+ "itd": "it'd",
82
+ "itd've": "it'd've",
83
+ "it'dve": "it'd've",
84
+ "itll": "it'll",
85
+ "let's": "let's",
86
+ "maam": "ma'am",
87
+ "mightnt": "mightn't",
88
+ "mightnt've": "mightn't've",
89
+ "mightn'tve": "mightn't've",
90
+ "mightve": "might've",
91
+ "mustnt": "mustn't",
92
+ "mustve": "must've",
93
+ "neednt": "needn't",
94
+ "notve": "not've",
95
+ "oclock": "o'clock",
96
+ "oughtnt": "oughtn't",
97
+ "ow's'at": "'ow's'at",
98
+ "'ows'at": "'ow's'at",
99
+ "'ow'sat": "'ow's'at",
100
+ "shant": "shan't",
101
+ "shed've": "she'd've",
102
+ "she'dve": "she'd've",
103
+ "she's": "she's",
104
+ "shouldve": "should've",
105
+ "shouldnt": "shouldn't",
106
+ "shouldnt've": "shouldn't've",
107
+ "shouldn'tve": "shouldn't've",
108
+ "somebody'd": "somebodyd",
109
+ "somebodyd've": "somebody'd've",
110
+ "somebody'dve": "somebody'd've",
111
+ "somebodyll": "somebody'll",
112
+ "somebodys": "somebody's",
113
+ "someoned": "someone'd",
114
+ "someoned've": "someone'd've",
115
+ "someone'dve": "someone'd've",
116
+ "someonell": "someone'll",
117
+ "someones": "someone's",
118
+ "somethingd": "something'd",
119
+ "somethingd've": "something'd've",
120
+ "something'dve": "something'd've",
121
+ "somethingll": "something'll",
122
+ "thats": "that's",
123
+ "thered": "there'd",
124
+ "thered've": "there'd've",
125
+ "there'dve": "there'd've",
126
+ "therere": "there're",
127
+ "theres": "there's",
128
+ "theyd": "they'd",
129
+ "theyd've": "they'd've",
130
+ "they'dve": "they'd've",
131
+ "theyll": "they'll",
132
+ "theyre": "they're",
133
+ "theyve": "they've",
134
+ "twas": "'twas",
135
+ "wasnt": "wasn't",
136
+ "wed've": "we'd've",
137
+ "we'dve": "we'd've",
138
+ "weve": "we've",
139
+ "werent": "weren't",
140
+ "whatll": "what'll",
141
+ "whatre": "what're",
142
+ "whats": "what's",
143
+ "whatve": "what've",
144
+ "whens": "when's",
145
+ "whered": "where'd",
146
+ "wheres": "where's",
147
+ "whereve": "where've",
148
+ "whod": "who'd",
149
+ "whod've": "who'd've",
150
+ "who'dve": "who'd've",
151
+ "wholl": "who'll",
152
+ "whos": "who's",
153
+ "whove": "who've",
154
+ "whyll": "why'll",
155
+ "whyre": "why're",
156
+ "whys": "why's",
157
+ "wont": "won't",
158
+ "wouldve": "would've",
159
+ "wouldnt": "wouldn't",
160
+ "wouldnt've": "wouldn't've",
161
+ "wouldn'tve": "wouldn't've",
162
+ "yall": "y'all",
163
+ "yall'll": "y'all'll",
164
+ "y'allll": "y'all'll",
165
+ "yall'd've": "y'all'd've",
166
+ "y'alld've": "y'all'd've",
167
+ "y'all'dve": "y'all'd've",
168
+ "youd": "you'd",
169
+ "youd've": "you'd've",
170
+ "you'dve": "you'd've",
171
+ "youll": "you'll",
172
+ "youre": "you're",
173
+ "youve": "you've",
174
+ }
175
+
176
+ NUMBER_MAP = {
177
+ "none": "0",
178
+ "zero": "0",
179
+ "one": "1",
180
+ "two": "2",
181
+ "three": "3",
182
+ "four": "4",
183
+ "five": "5",
184
+ "six": "6",
185
+ "seven": "7",
186
+ "eight": "8",
187
+ "nine": "9",
188
+ "ten": "10",
189
+ }
190
+ ARTICLES = ["a", "an", "the"]
191
+ PERIOD_STRIP = re.compile(r"(?!<=\d)(\.)(?!\d)")
192
+ COMMA_STRIP = re.compile(r"(?<=\d)(\,)+(?=\d)")
193
+ PUNCTUATIONS = [
194
+ ";",
195
+ r"/",
196
+ "[",
197
+ "]",
198
+ '"',
199
+ "{",
200
+ "}",
201
+ "(",
202
+ ")",
203
+ "=",
204
+ "+",
205
+ "\\",
206
+ "_",
207
+ "-",
208
+ ">",
209
+ "<",
210
+ "@",
211
+ "`",
212
+ ",",
213
+ "?",
214
+ "!",
215
+ ]
216
+
217
+ def __init__(self, *args, **kwargs):
218
+ pass
219
+
220
+ def word_tokenize(self, word):
221
+ word = word.lower()
222
+ word = word.replace(",", "").replace("?", "").replace("'s", " 's")
223
+ return word.strip()
224
+
225
+ def process_punctuation(self, in_text):
226
+ out_text = in_text
227
+ for p in self.PUNCTUATIONS:
228
+ if (p + " " in in_text or " " + p in in_text) or (
229
+ re.search(self.COMMA_STRIP, in_text) is not None
230
+ ):
231
+ out_text = out_text.replace(p, "")
232
+ else:
233
+ out_text = out_text.replace(p, " ")
234
+ out_text = self.PERIOD_STRIP.sub("", out_text, re.UNICODE)
235
+ return out_text
236
+
237
+ def process_digit_article(self, in_text):
238
+ out_text = []
239
+ temp_text = in_text.lower().split()
240
+ for word in temp_text:
241
+ word = self.NUMBER_MAP.setdefault(word, word)
242
+ if word not in self.ARTICLES:
243
+ out_text.append(word)
244
+ else:
245
+ pass
246
+ for word_id, word in enumerate(out_text):
247
+ if word in self.CONTRACTIONS:
248
+ out_text[word_id] = self.CONTRACTIONS[word]
249
+ out_text = " ".join(out_text)
250
+ return out_text
251
+
252
+ def __call__(self, item):
253
+ item = self.word_tokenize(item)
254
+ item = item.replace("\n", " ").replace("\t", " ").strip()
255
+ item = self.process_punctuation(item)
256
+ item = self.process_digit_article(item)
257
+ return item
258
+
259
+
260
+ class TextVQAAccuracyEvaluator:
261
+ def __init__(self):
262
+ self.answer_processor = EvalAIAnswerProcessor()
263
+
264
+ def _compute_answer_scores(self, raw_answers):
265
+ """
266
+ compute the accuracy (soft score) of human answers
267
+ """
268
+ answers = [self.answer_processor(a) for a in raw_answers]
269
+ assert len(answers) == 10
270
+ gt_answers = list(enumerate(answers))
271
+ unique_answers = set(answers)
272
+ unique_answer_scores = {}
273
+
274
+ for unique_answer in unique_answers:
275
+ accs = []
276
+ for gt_answer in gt_answers:
277
+ other_answers = [item for item in gt_answers if item != gt_answer]
278
+ matching_answers = [
279
+ item for item in other_answers if item[1] == unique_answer
280
+ ]
281
+ acc = min(1, float(len(matching_answers)) / 3)
282
+ accs.append(acc)
283
+ unique_answer_scores[unique_answer] = sum(accs) / len(accs)
284
+
285
+ return unique_answer_scores
286
+
287
+ def eval_pred_list(self, pred_list):
288
+ pred_scores = []
289
+ for entry in pred_list:
290
+ pred_answer = self.answer_processor(entry["pred_answer"])
291
+ unique_answer_scores = self._compute_answer_scores(entry["gt_answers"])
292
+ score = unique_answer_scores.get(pred_answer, 0.0)
293
+ pred_scores.append(score)
294
+
295
+ accuracy = sum(pred_scores) / len(pred_scores)
296
+ return accuracy
297
+
298
+ # MME
299
+ class MMEEvaluator:
300
+ def divide_chunks(self, l, n=2):
301
+ # looping till length l
302
+ for i in range(0, len(l), n):
303
+ yield l[i:i + n]
304
+
305
+ return
306
+
307
+ def parse_pred_ans(self, pred_ans):
308
+ pred_label = None
309
+ if pred_ans in ["yes", "no"]:
310
+ pred_label = pred_ans
311
+ else:
312
+ prefix_pred_ans = pred_ans[:4]
313
+
314
+ if "yes" in prefix_pred_ans:
315
+ pred_label = "yes"
316
+ elif "no" in prefix_pred_ans:
317
+ pred_label = "no"
318
+ else:
319
+ pred_label = "other"
320
+
321
+ return pred_label
322
+
323
+
324
+ def compute_metric(self, gts, preds):
325
+ assert len(gts) == len(preds)
326
+
327
+ label_map = {
328
+ "yes": 1,
329
+ "no": 0,
330
+ "other": -1,
331
+ }
332
+
333
+ gts = [label_map[x] for x in gts]
334
+ preds = [label_map[x] for x in preds]
335
+
336
+ acc = accuracy_score(gts, preds)
337
+
338
+ clean_gts = []
339
+ clean_preds = []
340
+ other_num = 0
341
+ for gt, pred in zip(gts, preds):
342
+ if pred == -1:
343
+ other_num += 1
344
+ continue
345
+ clean_gts.append(gt)
346
+ clean_preds.append(pred)
347
+
348
+
349
+ conf_mat = confusion_matrix(clean_gts, clean_preds, labels=[1,0])
350
+ precision = precision_score(clean_gts, clean_preds, average='binary')
351
+ recall = recall_score(clean_gts, clean_preds, average='binary')
352
+ tp, fn = conf_mat[0]
353
+ fp, tn = conf_mat[1]
354
+
355
+ metric_dict = dict()
356
+ metric_dict = {
357
+ "TP": tp,
358
+ "FN": fn,
359
+ "TN": tn,
360
+ "FP": fp,
361
+ "precision": precision,
362
+ "recall": recall,
363
+ "other_num": other_num,
364
+ "acc": acc,
365
+ }
366
+
367
+ return metric_dict
368
+
369
+
370
+ def process_result(self, results_dir):
371
+ eval_type_dict = {
372
+ "Perception": ["existence", "count", "position", "color", "posters", "celebrity", "scene", "landmark", "artwork", "OCR"],
373
+ "Cognition": ["commonsense_reasoning", "numerical_calculation", "text_translation", "code_reasoning"]
374
+ }
375
+
376
+ model_score_dict = dict()
377
+ for eval_type, task_name_list in eval_type_dict.items():
378
+
379
+ scores = 0
380
+ task_score_dict = dict()
381
+
382
+ for task_name in task_name_list:
383
+ if not os.path.exists(results_dir):
384
+ os.makedirs(results_dir)
385
+ task_txt = os.path.join(results_dir, task_name + ".txt")
386
+ lines = open(task_txt, 'r').readlines()
387
+ chunk_lines = list(self.divide_chunks(lines)) # one image corresponds to two questions
388
+
389
+ img_num = len(chunk_lines)
390
+ task_other_ans_num = 0
391
+ task_score = 0
392
+ acc_plus_correct_num = 0
393
+ gts = []
394
+ preds = []
395
+
396
+ for img_items in chunk_lines:
397
+ assert len(img_items) == 2
398
+ img_correct_num = 0
399
+
400
+ for img_item in img_items:
401
+ img_name, question, gt_ans, pred_ans = img_item.split("\t")
402
+
403
+ gt_ans = gt_ans.lower()
404
+ pred_ans = pred_ans.lower()
405
+
406
+ assert gt_ans in ["yes", "no"] # gt can only be yes or no.
407
+
408
+ pred_ans = self.parse_pred_ans(pred_ans)
409
+ assert pred_ans in ["yes", "no", "other"]
410
+
411
+ gts.append(gt_ans)
412
+ preds.append(pred_ans)
413
+
414
+ if gt_ans == pred_ans:
415
+ img_correct_num += 1
416
+
417
+ if pred_ans not in ["yes", "no"]:
418
+ task_other_ans_num += 1
419
+
420
+ if img_correct_num == 2:
421
+ acc_plus_correct_num += 1
422
+
423
+ # cal TP precision acc, etc.
424
+ metric_dict = self.compute_metric(gts, preds)
425
+ acc_plus = acc_plus_correct_num / img_num
426
+ metric_dict["acc_plus"] = acc_plus
427
+
428
+
429
+ for k, v in metric_dict.items():
430
+ if k in ["acc", "acc_plus"]:
431
+ task_score += v*100
432
+
433
+ task_score_dict[task_name] = task_score
434
+
435
+ scores += task_score
436
+ task_score_dict['total'] = scores
437
+ model_score_dict[eval_type] = task_score_dict
438
+ return model_score_dict
439
+
440
+ # For MMMU, convert all <image #> tokens to <image>
441
+ def replace_image_tokens(question):
442
+ replaced = set()
443
+ def replace_token(match):
444
+ token = match.group(0)
445
+ if token not in replaced:
446
+ replaced.add(token)
447
+ return '<image>'
448
+ return token
449
+
450
+ pattern = re.compile(r'<image\s\d+>')
451
+ return pattern.sub(replace_token, question)
452
+
453
+ # For MMMU, count all <image #> tokens
454
+ def count_unique_image_tokens(string):
455
+ pattern = r'<image\s\d+>'
456
+ matches = re.findall(pattern, string)
457
+ return len(set(matches))
458
+
459
+ # TextVQA
460
+ def prompt_processor(self, prompt):
461
+ if prompt.startswith('OCR tokens: '):
462
+ pattern = r"Question: (.*?) Short answer:"
463
+ match = re.search(pattern, prompt, re.DOTALL)
464
+ question = match.group(1)
465
+ elif 'Reference OCR token: ' in prompt and len(prompt.split('\n')) == 3:
466
+ if prompt.startswith('Reference OCR token:'):
467
+ question = prompt.split('\n')[1]
468
+ else:
469
+ question = prompt.split('\n')[0]
470
+ elif len(prompt.split('\n')) == 2:
471
+ question = prompt.split('\n')[0]
472
+ else:
473
+ assert False
474
+
475
+ return question.lower()
476
+
477
+ # Convert answer to integer
478
+ def char_to_int(char):
479
+ return ord(char.upper()) - ord('A')
480
+
481
+ # In case model does not output a single letter, find the choice in answer
482
+ def convert_to_choice(answer, candidates):
483
+ options = ["A", "B", "C", "D", "E"]
484
+ if answer in options:
485
+ extracted_answer = answer
486
+ elif len(answer) >= 2 and answer[0] in options and "." in answer:
487
+ extracted_answer= answer[0]
488
+ else:
489
+ pattern = re.compile(r'The answer is ([A-Z]).')
490
+ res = pattern.findall(answer)
491
+ if len(res) == 1:
492
+ extracted_answer = res[0] # 'A', 'B', ...
493
+ else:
494
+ extracted_answer = "FAILED"
495
+
496
+ if extracted_answer in options[:len(candidates)]:
497
+ return options.index(extracted_answer)
498
+ else:
499
+ return -1
500
+
501
+ def get_pred_idx(prediction, choices, options):
502
+ """
503
+ Get the index (e.g. 2) from the prediction (e.g. 'C')
504
+ """
505
+ if prediction in options[:len(choices)]:
506
+ return options.index(prediction)
507
+ else:
508
+ return -1
509
+
510
+ # Chart QA
511
+ def relaxed_correctness(target: str,
512
+ prediction: str,
513
+ max_relative_change: float = 0.05) -> bool:
514
+ """Calculates relaxed correctness.
515
+
516
+ The correctness tolerates certain error ratio defined by max_relative_change.
517
+ See https://arxiv.org/pdf/2203.10244.pdf, end of section 5.1:
518
+ “Following Methani et al. (2020), we use a relaxed accuracy measure for the
519
+ numeric answers to allow a minor inaccuracy that may result from the automatic
520
+ data extraction process. We consider an answer to be correct if it is within
521
+ 5% of the gold answer. For non-numeric answers, we still need an exact match
522
+ to consider an answer to be correct.”
523
+
524
+ Args:
525
+ target: Target string.
526
+ prediction: Predicted string.
527
+ max_relative_change: Maximum relative change.
528
+
529
+ Returns:
530
+ Whether the prediction was correct given the specified tolerance.
531
+ """
532
+
533
+ def _to_float(text: str) -> Optional[float]:
534
+ try:
535
+ if text.endswith('%'):
536
+ # Convert percentages to floats.
537
+ return float(text.rstrip('%')) / 100.0
538
+ else:
539
+ return float(text)
540
+ except ValueError:
541
+ return None
542
+
543
+ prediction_float = _to_float(prediction)
544
+ target_float = _to_float(target)
545
+ if prediction_float is not None and target_float:
546
+ relative_change = abs(prediction_float -
547
+ target_float) / abs(target_float)
548
+ return relative_change <= max_relative_change
549
+ else:
550
+ return prediction.lower() == target.lower()
551
+
552
+ # MME
553
+ def get_gt(data_path):
554
+ ground_truth = {}
555
+ for category in os.listdir(data_path):
556
+ category_dir = os.path.join(data_path, category)
557
+ if not os.path.isdir(category_dir):
558
+ continue
559
+ if os.path.exists(os.path.join(category_dir, 'images')):
560
+ image_path = os.path.join(category_dir, 'images')
561
+ qa_path = os.path.join(category_dir, 'questions_answers_YN')
562
+ else:
563
+ image_path = qa_path = category_dir
564
+ assert os.path.isdir(image_path), image_path
565
+ assert os.path.isdir(qa_path), qa_path
566
+ for file in os.listdir(qa_path):
567
+ if not file.endswith('.txt'):
568
+ continue
569
+ for line in open(os.path.join(qa_path, file)):
570
+ question, answer = line.strip().split('\t')
571
+ ground_truth[(category, file, question)] = answer
572
+ return ground_truth
573
+
574
+ # Chart QA
575
+ def relaxed_correctness(target: str,
576
+ prediction: str,
577
+ max_relative_change: float = 0.05) -> bool:
578
+ """Calculates relaxed correctness.
579
+
580
+ The correctness tolerates certain error ratio defined by max_relative_change.
581
+ See https://arxiv.org/pdf/2203.10244.pdf, end of section 5.1:
582
+ “Following Methani et al. (2020), we use a relaxed accuracy measure for the
583
+ numeric answers to allow a minor inaccuracy that may result from the automatic
584
+ data extraction process. We consider an answer to be correct if it is within
585
+ 5% of the gold answer. For non-numeric answers, we still need an exact match
586
+ to consider an answer to be correct.”
587
+
588
+ Args:
589
+ target: Target string.
590
+ prediction: Predicted string.
591
+ max_relative_change: Maximum relative change.
592
+
593
+ Returns:
594
+ Whether the prediction was correct given the specified tolerance.
595
+ """
596
+
597
+ def _to_float(text: str) -> Optional[float]:
598
+ try:
599
+ if text.endswith('%'):
600
+ # Convert percentages to floats.
601
+ return float(text.rstrip('%')) / 100.0
602
+ else:
603
+ return float(text)
604
+ except ValueError:
605
+ return None
606
+
607
+ prediction_float = _to_float(prediction)
608
+ target_float = _to_float(target)
609
+ if prediction_float is not None and target_float:
610
+ relative_change = abs(prediction_float -
611
+ target_float) / abs(target_float)
612
+ return relative_change <= max_relative_change
613
+ else:
614
+ return prediction.lower() == target.lower()
615
+
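A few worked cases of the relaxed metric for clarity (arguments are target, prediction):

# relaxed_correctness("0.25", "0.26") -> True   (|0.26 - 0.25| / 0.25 = 0.04 <= 0.05)
# relaxed_correctness("0.25", "0.27") -> False  (relative change 0.08 > 0.05)
# relaxed_correctness("cat", "Cat")   -> True   (non-numeric: case-insensitive exact match)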
616
+ def evaluate_relaxed_accuracy(entries):
617
+ scores = []
618
+ for elem in entries:
619
+ if isinstance(elem['annotation'], str):
620
+ elem['annotation'] = [elem['annotation']]
621
+ score = max([
622
+ relaxed_correctness(elem['answer'].strip(), ann)
623
+ for ann in elem['annotation']
624
+ ])
625
+ scores.append(score)
626
+ return sum(scores) / len(scores)
627
+
628
+ # POPE
629
+ def eval_pope(answers, label_file):
630
+ label_list = [json.loads(q)['label'] for q in open(label_file, 'r')]
631
+
632
+ for answer in answers:
633
+ text = answer['answer'].lower()
634
+
635
+ # Only keep the first sentence
636
+ if text.find('.') != -1:
637
+ text = text.split('.')[0]
638
+
639
+ text = text.replace(',', '')
640
+ words = text.split(' ')
641
+ if 'No' in words or 'not' in words or 'no' in words:
642
+ answer['answer'] = 'no'
643
+ else:
644
+ answer['answer'] = 'yes'
645
+
646
+ for i in range(len(label_list)):
647
+ if label_list[i] == 'no':
648
+ label_list[i] = 0
649
+ else:
650
+ label_list[i] = 1
651
+
652
+ pred_list = []
653
+ for answer in answers:
654
+ if answer['answer'] == 'no':
655
+ pred_list.append(0)
656
+ else:
657
+ pred_list.append(1)
658
+
659
+ pos = 1
660
+ neg = 0
661
+
662
+ TP, TN, FP, FN = 0, 0, 0, 0
663
+ for pred, label in zip(pred_list, label_list):
664
+ if pred == pos and label == pos:
665
+ TP += 1
666
+ elif pred == pos and label == neg:
667
+ FP += 1
668
+ elif pred == neg and label == neg:
669
+ TN += 1
670
+ elif pred == neg and label == pos:
671
+ FN += 1
672
+
673
+ acc = (TP + TN) / (TP + TN + FP + FN)
674
+ return acc
675
+
676
+ # Eval GQA
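Hypothetical input shapes accepted by eval_pope:

# answers:    [{"answer": "No, there is no dog in the image."}, {"answer": "Yes."}, ...]
# label_file: one JSON object per line, e.g. {"label": "no"} / {"label": "yes"}
# The function binarises both sides in order and returns plain accuracy (TP+TN)/(TP+TN+FP+FN).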
677
+ # book to float
678
+ def toScore(b):
679
+ return float(1 if b else 0)
680
+
681
+ # Compute average of a list
682
+ def avg(l):
683
+ if len(l) == 0:
684
+ return 0
685
+ return float(sum(l)) / len(l)
686
+
687
+ def eval_gqa(predictions, questions):
688
+ # Initialize data structure to track all metrics: e.g. accuracy, validity and plausibility, as well as
689
+ # accuracy per question type, length and number of reasoning steps.
690
+ scores = {
691
+ "accuracy": [], # list of accuracies per question (1 if correct else 0). Will be averaged ultimately.
692
+ "binary": [], # list of accuracies per a binary question (1 if correct else 0). Will be averaged ultimately.
693
+ "open": [], # list of accuracies per an open question (1 if correct else 0). Will be averaged ultimately.
694
+ "validity": [], # list of validity per question (1 if valid else 0).
695
+ "plausibility": [], # list of plausibility per question (1 if plausible else 0).
696
+ "consistency": [], # list of consistency scores for entailed questions.
697
+ "accuracyPerStructuralType": defaultdict(list), # list of question accuracies for each structural type (e.g. compare, logic questions).
698
+ "accuracyPerSemanticType": defaultdict(list), # list of question accuracies for each semantic type (e.g. questions about an object, an attribute, a relation).
699
+ "accuracyPerLength": defaultdict(list), # list of question accuracies per question's word number.
700
+ "accuracyPerSteps": defaultdict(list), # list of question accuracies per question's reasoning length (steps number).
701
+ "grounding": [] # list of grounding scores for each question.
702
+ }
703
+
704
+ # Initialize golden and predicted histograms per each question group. Used to compute the distribution metric.
705
+ dist = {
706
+ "gold": defaultdict(lambda: defaultdict(int)),
707
+ "predicted": defaultdict(lambda: defaultdict(int))
708
+ }
709
+
710
+ ##### Question lengths - words numbers and reasoning steps number
711
+ ##########################################################################################
712
+
713
+ # Compute question length (words number)
714
+ def getWordsNum(question):
715
+ return len(question["question"].split())
716
+
717
+ # Compute number of reasoning steps (excluding the final "querying" step which doesn't increase effective reasoning length)
718
+ def getStepsNum(question):
719
+ return len([c for c in question["semantic"] if not (any([o in "{}: {}".format(c["operation"], c["argument"])
720
+ for o in ["exist", "query: name", "choose name"]]))])
721
+
722
+ ##### Main score computation
723
+ ##########################################################################################
724
+
725
+ # Loop over the questions and compute metrics
726
+ for qid, question in questions.items():
727
+ gold = question["answer"]
728
+ if qid not in predictions:
729
+ continue
730
+ predicted = predictions[qid].lower()
731
+
732
+ correct = (predicted == gold)
733
+ score = toScore(correct)
734
+
735
+ wordsNum = getWordsNum(question)
736
+ stepsNum = getStepsNum(question)
737
+
738
+ # Compute scores over the balanced dataset (more robust against cheating by making educated guesses)
739
+ if question["isBalanced"]:
740
+ # Update accuracy
741
+ scores["accuracy"].append(score)
742
+ scores["accuracyPerLength"][wordsNum].append(score)
743
+ scores["accuracyPerSteps"][stepsNum].append(score)
744
+ scores["accuracyPerStructuralType"][question["types"]["structural"]].append(score)
745
+ scores["accuracyPerSemanticType"][question["types"]["semantic"]].append(score)
746
+ answerType = "open" if question["types"]["structural"] == "query" else "binary"
747
+ scores[answerType].append(score)
748
+
749
+ # Update histograms for gold and predicted answers
750
+ globalGroup = question["groups"]["global"]
751
+ if globalGroup is not None:
752
+ dist["gold"][globalGroup][gold] += 1
753
+ dist["predicted"][globalGroup][predicted] += 1
754
+
755
+ # Average scores over all questions (in the balanced dataset) and print scores
756
+ metrics = [
757
+ "binary",
758
+ "open",
759
+ "accuracy",
760
+ "consistency",
761
+ "validity",
762
+ "plausibility",
763
+ "grounding",
764
+ ]
765
+
766
+ detailedMetrics = [
767
+ ("accuracyPerStructuralType", "Accuracy / structural type"),
768
+ ("accuracyPerSemanticType", "Accuracy / semantic type"),
769
+ ("accuracyPerSteps", "Accuracy / steps number"),
770
+ ("accuracyPerLength", "Accuracy / words number")
771
+ ]
772
+
773
+ subMetrics = {
774
+ "attr": "attribute",
775
+ "cat": "category",
776
+ "global": "scene",
777
+ "obj": "object",
778
+ "rel": "relation"
779
+ }
780
+ # average
781
+ for k in metrics:
782
+ if isinstance(scores[k], list):
783
+ scores[k] = avg(scores[k]) * 100
784
+
785
+ for k, _ in detailedMetrics:
786
+ for t in scores[k]:
787
+ scores[k][t] = avg(scores[k][t]) * 100, len(scores[k][t])
788
+
789
+ # print
790
+ print("")
791
+ for m in metrics:
792
+ # skip grounding and consistency scores if not requested
793
+ if m == "grounding":
794
+ continue
795
+ if m == "consistency":
796
+ continue
797
+
798
+ # print score
799
+ print("{title}: {score:.2f}{suffix}".format(title = m.capitalize(), score = scores[m],
800
+ suffix = " (lower is better)" if m == "distribution" else "%"))
801
+
802
+ for m, mPrintName in detailedMetrics:
803
+ print("")
804
+ # print metric title
805
+ print("{}:".format(mPrintName))
806
+
807
+ for t in sorted(list(scores[m].keys())):
808
+ # set sub-metric title
809
+ tName = t
810
+ if isinstance(scores[k], list):
811
+ tName = subMetrics.get(t, t).capitalize()
812
+
813
+ # print score
814
+ print(" {title}: {score:.2f}{suffix} ({amount} questions)".format(title = tName,
815
+ score = scores[m][t][0], suffix = "%", amount = scores[m][t][1]))
816
+
817
+ return scores
818
+
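Hypothetical call shape for eval_gqa, inferred from the fields accessed above:

# predictions: {"qid123": "yes", ...}   (model answers; lower-cased inside the function)
# questions:   {"qid123": {"answer": "yes", "isBalanced": True, "question": "...",
#                          "semantic": [...], "types": {...}, "groups": {...}}, ...}
# Returns the scores dict with overall/binary/open accuracies already scaled to percentages.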
819
+ # MMMU
820
+ DOMAIN_CAT2SUB_CAT = {
821
+ 'Art and Design': ['Art', 'Art_Theory', 'Design', 'Music'],
822
+ 'Business': ['Accounting', 'Economics', 'Finance', 'Manage','Marketing'],
823
+ 'Science': ['Biology', 'Chemistry', 'Geography', 'Math', 'Physics',],
824
+ 'Health and Medicine': ['Basic_Medical_Science', 'Clinical_Medicine', 'Diagnostics_and_Laboratory_Medicine', 'Pharmacy', 'Public_Health'],
825
+ 'Humanities and Social Science': ['History', 'Literature', 'Sociology', 'Psychology'],
826
+ 'Tech and Engineering': ['Agriculture', 'Architecture_and_Engineering', 'Computer_Science', 'Electronics', 'Energy_and_Power', 'Materials', 'Mechanical_Engineering'],
827
+ }
828
+
829
+
830
+ CAT_SHORT2LONG = {
831
+ 'acc': 'Accounting',
832
+ 'agri': 'Agriculture',
833
+ 'arch': 'Architecture_and_Engineering',
834
+ 'art': 'Art',
835
+ 'art_theory': 'Art_Theory',
836
+ 'bas_med': 'Basic_Medical_Science',
837
+ 'bio': 'Biology',
838
+ 'chem': 'Chemistry',
839
+ 'cli_med': 'Clinical_Medicine',
840
+ 'cs': 'Computer_Science',
841
+ 'design': 'Design',
842
+ 'diag_med': 'Diagnostics_and_Laboratory_Medicine',
843
+ 'econ': 'Economics',
844
+ 'elec': 'Electronics',
845
+ 'ep': 'Energy_and_Power',
846
+ 'fin': 'Finance',
847
+ 'geo': 'Geography',
848
+ 'his': 'History',
849
+ 'liter': 'Literature',
850
+ 'manage': 'Manage',
851
+ 'mark': 'Marketing',
852
+ 'mate': 'Materials',
853
+ 'math': 'Math',
854
+ 'mech': 'Mechanical_Engineering',
855
+ 'music': 'Music',
856
+ 'phar': 'Pharmacy',
857
+ 'phys': 'Physics',
858
+ 'psy': 'Psychology',
859
+ 'pub_health': 'Public_Health',
860
+ 'socio': 'Sociology'
861
+ }
862
+
863
+ """Response Parsing and Evaluation for various models"""
864
+
865
+ # ----------- Process Multi-choice -------------
866
+ def parse_multi_choice_response(response, all_choices, index2ans):
867
+ """
868
+ Parse the prediction from the generated response.
869
+ Return the predicted index e.g., A, B, C, D.
870
+ """
871
+ for char in [',', '.', '!', '?', ';', ':', "'"]:
872
+ response = response.strip(char)
873
+ response = " " + response + " " # add space to avoid partial match
874
+
875
+ index_ans = True
876
+ ans_with_brack = False
877
+ candidates = []
878
+ for choice in all_choices: # e.g., (A) (B) (C) (D)
879
+ if f'({choice})' in response:
880
+ candidates.append(choice)
881
+ ans_with_brack = True
882
+
883
+ if len(candidates) == 0:
884
+ for choice in all_choices: # e.g., A B C D
885
+ if f' {choice} ' in response:
886
+ candidates.append(choice)
887
+
888
+ # if all above doesn't get candidates, check if the content is larger than 5 tokens and try to parse the example
889
+ if len(candidates) == 0 and len(response.split()) > 5:
890
+ for index, ans in index2ans.items():
891
+ if ans.lower() in response.lower():
892
+ candidates.append(index)
893
+ index_ans = False # it's content ans.
894
+
895
+ if len(candidates) == 0: # still not get answer, randomly choose one.
896
+ pred_index = random.choice(all_choices)
897
+ elif len(candidates) > 1:
898
+ start_indexes = []
899
+ if index_ans:
900
+ if ans_with_brack:
901
+ for can in candidates:
902
+ index = response.rfind(f'({can})')
903
+ start_indexes.append(index) # -1 will be ignored anyway
904
+ # start_indexes = [generated_response.index(f'({can})') for can in candidates]
905
+ else:
906
+ for can in candidates:
907
+ index = response.rfind(f" {can} ")
908
+ start_indexes.append(index)
909
+ else:
910
+ for can in candidates:
911
+ index = response.lower().rfind(index2ans[can].lower())
912
+ start_indexes.append(index)
913
+ # get the last one
914
+ pred_index = candidates[np.argmax(start_indexes)]
915
+ else: # if only one candidate, use it.
916
+ pred_index = candidates[0]
917
+
918
+ return pred_index
919
+
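A minimal usage sketch of parse_multi_choice_response (editor's example, not part of this commit; the response and choices below are hypothetical):

# Hypothetical inputs: a model response for an A-D multiple-choice question.
all_choices = ['A', 'B', 'C', 'D']
index2ans = {'A': '12', 'B': '15', 'C': '18', 'D': '21'}
response = "The area of the triangle is therefore (C) 18."

pred = parse_multi_choice_response(response, all_choices, index2ans)
# pred == 'C' -- the bracketed letter '(C)' is found before any fallback matching is needed.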
920
+ # ----------- Process Open -------------
921
+ def check_is_number(string):
922
+ """
923
+ Check if the given string is a number.
924
+ """
925
+ try:
926
+ float(string.replace(',', ''))
927
+ return True
928
+ except ValueError:
929
+ # not parseable as a number
930
+ return False
931
+
932
+ def normalize_str(string):
933
+ """
934
+ Normalize the str to lower case and make them float numbers if possible.
935
+ """
936
+ # decide whether the string is numeric or plain text
937
+
938
+ # if number, numerize it.
939
+ string = string.strip()
940
+
941
+ is_number = check_is_number(string)
942
+
943
+ if is_number:
944
+ string = string.replace(',', '')
945
+ string = float(string)
946
+ # leave 2 decimal
947
+ string = round(string, 2)
948
+ return [string]
949
+ else: # it's likely to be a string
950
+ # lower it
951
+ string = string.lower()
952
+ if len(string) == 1:
953
+ return [" " + string, string + " "] # avoid trivial matches
954
+ return [string]
955
+
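A quick sanity sketch of normalize_str (editor's example, not in the source):

normalize_str("1,234.567")  # -> [1234.57]    numeric: comma stripped, rounded to 2 decimals
normalize_str("Paris")      # -> ['paris']    plain text: lower-cased
normalize_str("A")          # -> [' a', 'a ']  single char padded to avoid trivial substring hits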
956
+ def extract_numbers(string):
957
+ """
958
+ Extract all forms of numbers from a string with regex.
959
+ """
960
+ # Pattern for numbers with commas
961
+ pattern_commas = r'-?\b\d{1,3}(?:,\d{3})+\b'
962
+ # Pattern for scientific notation
963
+ pattern_scientific = r'-?\d+(?:\.\d+)?[eE][+-]?\d+'
964
+ # Pattern for simple numbers without commas
965
+ pattern_simple = r'-?(?:\d+\.\d+|\.\d+|\d+\b)(?![eE][+-]?\d+)(?![,\d])'
966
+
967
+ # Extract numbers with commas
968
+ numbers_with_commas = re.findall(pattern_commas, string)
969
+ # Extract numbers in scientific notation
970
+ numbers_scientific = re.findall(pattern_scientific, string)
971
+ # Extract simple numbers without commas
972
+ numbers_simple = re.findall(pattern_simple, string)
973
+
974
+ # Combine all extracted numbers
975
+ all_numbers = numbers_with_commas + numbers_scientific + numbers_simple
976
+ return all_numbers
977
+
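An illustrative call (editor's sketch): comma-grouped, scientific, and plain numbers are collected by the three patterns above.

extract_numbers("the perimeter is approximately 22.3 cm")  # -> ['22.3']
extract_numbers("a population of 1,234,567")               # the comma pattern yields '1,234,567'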
978
+ def parse_open_response(response):
979
+ """
980
+ Parse the prediction from the generated response.
981
+ Return a list of predicted strings or numbers.
982
+ """
983
+ # content = content.strip("\n").strip(".").strip(" ")
984
+ def get_key_subresponses(response):
985
+ key_responses = []
986
+ response = response.strip().strip(".").lower()
987
+ sub_responses = re.split(r'\.\s(?=[A-Z])|\n', response)
988
+ indicators_of_keys = ['could be ', 'so ', 'is ',
989
+ 'thus ', 'therefore ', 'final ', 'answer ', 'result ']
990
+ key_responses = []
991
+ for index, resp in enumerate(sub_responses):
992
+ # if it is the last sub-response, also accept an equation (the entire response can be a single sentence containing an equation)
993
+ if index == len(sub_responses) - 1:
994
+ indicators_of_keys.extend(['='])
995
+ shortest_key_response = None # the shortest response that may contain the answer (tail part of the response)
996
+ for indicator in indicators_of_keys:
997
+ if indicator in resp:
998
+ if not shortest_key_response:
999
+ shortest_key_response = resp.split(indicator)[-1].strip()
1000
+ else:
1001
+ if len(resp.split(indicator)[-1].strip()) < len(shortest_key_response):
1002
+ shortest_key_response = resp.split(indicator)[-1].strip()
1003
+ # key_responses.append(resp.split(indicator)[1].strip())
1004
+
1005
+ if shortest_key_response:
1006
+ # and it's not trivial
1007
+ if shortest_key_response.strip() not in [":", ",", ".", "!", "?", ";", ":", "'"]:
1008
+ key_responses.append(shortest_key_response)
1009
+ if len(key_responses) == 0: # did not find any
1010
+ return [response]
1011
+ return key_responses
1012
+ # pdb.set_trace()
1013
+ key_responses = get_key_subresponses(response)
1014
+
1015
+ pred_list = key_responses.copy() # keep the original string response
1016
+ for resp in key_responses:
1017
+ pred_list.extend(extract_numbers(resp))
1018
+
1019
+ tmp_pred_list = []
1020
+ for i in range(len(pred_list)):
1021
+ tmp_pred_list.extend(normalize_str(pred_list[i]))
1022
+ pred_list = tmp_pred_list
1023
+
1024
+ # remove duplicates
1025
+ pred_list = list(set(pred_list))
1026
+
1027
+ return pred_list
1028
+
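A usage sketch of parse_open_response (editor's example; the response string is hypothetical):

preds = parse_open_response("Therefore, the distance d is approximately 22.3 meters.")
# -> a de-duplicated list such as ['approximately 22.3 meters', 22.3]
# (the tail after the 'is ' indicator, plus any numbers it contains, all normalized)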
1029
+ # ----------- Evaluation -------------
1030
+
1031
+ def eval_multi_choice(gold_i, pred_i):
1032
+ """
1033
+ Evaluate a multiple choice instance.
1034
+ """
1035
+ correct = False
1036
+ # only if they are exactly the same do we consider it correct
1037
+ if isinstance(gold_i, list):
1038
+ for answer in gold_i:
1039
+ if answer == pred_i:
1040
+ correct = True
1041
+ break
1042
+ else: # gold_i is a string
1043
+ if gold_i == pred_i:
1044
+ correct = True
1045
+ return correct
1046
+
1047
+ def eval_open(gold_i, pred_i):
1048
+ """
1049
+ Evaluate an open question instance.
1050
+ """
1051
+ correct = False
1052
+ if isinstance(gold_i, list):
1053
+ # use float to avoid trivial matches
1054
+ norm_answers = []
1055
+ for answer in gold_i:
1056
+ norm_answers.extend(normalize_str(answer))
1057
+ else:
1058
+ norm_answers = normalize_str(gold_i)
1059
+ for pred in pred_i: # pred is already normalized in parse response phase
1060
+ if isinstance(pred, str): # if it's a string, then find if ans in the pred_i
1061
+ for norm_ans in norm_answers:
1062
+ # only see if the string answer in the string pred
1063
+ if isinstance(norm_ans, str) and norm_ans in pred:
1064
+ if not correct:
1065
+ correct = True
1066
+ break
1067
+ else: # it's a float number
1068
+ if pred in norm_answers:
1069
+ if not correct:
1070
+ correct = True
1071
+ break
1072
+ return correct
1073
+
1074
+ # ----------- Batch Evaluation -------------
1075
+ def evaluate(samples):
1076
+ """
1077
+ Batch evaluation for multiple choice and open questions.
1078
+ """
1079
+ pred_correct = 0
1080
+ judge_dict = dict()
1081
+ for sample in samples:
1082
+ gold_i = sample['answer']
1083
+ pred_i = sample['parsed_pred']
1084
+ if sample['question_type'] == 'multiple-choice':
1085
+ correct = eval_multi_choice(gold_i, pred_i)
1086
+ else: # open question
1087
+ correct = eval_open(gold_i, pred_i)
1088
+
1089
+ if correct:
1090
+ judge_dict[sample['id']] = 'Correct'
1091
+ pred_correct += 1
1092
+ else:
1093
+ judge_dict[sample['id']] = 'Wrong'
1094
+
1095
+ if len(samples) == 0:
1096
+ return judge_dict, {'acc': 0} # keep the (judge_dict, metrics) return shape even when samples is empty
1097
+ return judge_dict, {'acc': pred_correct / len(samples)}
1098
+
1099
+
1100
+
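A small end-to-end sketch of evaluate() (editor's example; the records below are hypothetical but shaped like the benchmark outputs it consumes):

samples = [
    {'id': 'q1', 'question_type': 'multiple-choice', 'answer': 'C', 'parsed_pred': 'C'},
    {'id': 'q2', 'question_type': 'open', 'answer': '22.3',
     'parsed_pred': parse_open_response("The distance is 22.3 meters.")},
]
judge_dict, metrics = evaluate(samples)
# judge_dict == {'q1': 'Correct', 'q2': 'Correct'}; metrics == {'acc': 1.0}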
1101
+ # ----------- Calculate Accuracy -------------
1102
+ def calculate_ins_level_acc(results: Dict):
1103
+ """Calculate the instruction level accuracy for given Subject results"""
1104
+ acc = 0
1105
+ ins_num = 0
1106
+ for cat_results in results.values():
1107
+ acc += cat_results['acc'] * cat_results['num_example']
1108
+ ins_num += cat_results['num_example']
1109
+ if ins_num == 0:
1110
+ return 0
1111
+ return acc / ins_num
1112
+
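calculate_ins_level_acc simply weights each subject's accuracy by its example count; a hypothetical sketch (editor's example):

results = {
    'Math':    {'acc': 0.50, 'num_example': 100},
    'Physics': {'acc': 0.80, 'num_example': 50},
}
calculate_ins_level_acc(results)  # -> (0.50*100 + 0.80*50) / 150 = 0.6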
1113
+ def eval_mathverse(output_dir, results, extract_file, score_file):
1114
+ openai.api_key = OPENAI_KEY
1115
+
1116
+ demo_prompt_extract = """
1117
+ I am providing you a response from a model to a math problem, termed 'Model Response'. You should extract the answer from the response as 'Extracted Answer'. Directly output the extracted answer with no explanation.
1118
+
1119
+ 1.
1120
+ Model response: 'Rounded to two decimal places, the perimeter of the sector is approximately:\n\n(-2, 1)'
1121
+ Extracted Answer: (-2, 1)
1122
+
1123
+ 2.
1124
+ Model response: 'at those points.\n\nTherefore, the correct option that represents the meaning of the intersection points of the graphs is:\n\nD. They give the solutions to the equation $f(t)=g(t)$.",'
1125
+ Extracted Answer: D
1126
+
1127
+ 3.
1128
+ Model response: ' at 1 (there's a closed circle at y = 1), the range in interval notation is \\((-4, 1]\\).\n\nFinal values:\nDomain: \\((-3, 3]\\)\nRange: \\((-4, 1]\\)'
1129
+ Extracted Answer: Domain: \\((-3, 3]\\)\nRange: \\((-4, 1]\\)
1130
+
1131
+ 4.
1132
+ Model response: 'As it stands, I cannot provide the correct option letter because there isn't enough information to solve for 'y'.'
1133
+ Extracted Answer: null
1134
+
1135
+ 5.
1136
+ Model response: 'Given that AB = 17.6 meters, we can now substitute into the equation:\n\nd = 17.6 / cos(38\u00b0)\n\nTherefore, to one decimal place, the distance d between Ned and Bart is approximately 22.3 meters.'
1137
+ Extracted answer: 22.3
1138
+
1139
+ 6.
1140
+ Model response: have all the coefficients for the quadratic function:\n\\( f(x) = ax^2 + bx + c \\)\n\\( f(x) = -1x^2 - 2x + 1 \\)\n\nTherefore, the equation for the graphed function \\( f \\) is:\n\\( f(x) = -x^2 - 2x + 1 \\)"'
1141
+ Extracted answer: f(x) = -x^2 - 2x + 1
1142
+
1143
+ 7.
1144
+ """
1145
+ demo_prompt_score = """
1146
+ Below are two answers to a math question. Question is [Question], [Standard Answer] is the standard answer to the question, and [Model_answer] is the answer extracted from a model's output to this question. Determine whether these two answers are consistent.
1147
+ Please note that only when the [Model_answer] completely matches the [Standard Answer] means they are consistent. For non-multiple-choice questions, if the meaning is expressed in the same way, it is also considered consistent, for example, 0.5m and 50cm.
1148
+ If they are consistent, Judgement is 1; if they are different, Judgement is 0.
1149
+
1150
+ [Question]: Write the set of numbers represented on the number line in interval notation.
1151
+ [Standard Answer]: (-2,1]
1152
+ [Model_answer] : Extracted Answer: \\((-2, 1)\\)
1153
+ Judgement: 0
1154
+
1155
+ [Question]: As shown in the figure, circle O has a radius 1.0, if angle BAC = 60.0, then the length of BC is ()\nChoices:\nA:2\nB:2\u221a{{3}}\nC:\u221a{{3}}\nD:2\u221a{{2}}
1156
+ [Standard Answer]: C
1157
+ [Model_answer] : B:2\u221a{{3}}
1158
+ Judgement: 0
1159
+
1160
+ [Question]: Find the domain and range of the function f using interval notation.
1161
+ [Standard Answer]: domain: [-4, 0) and range: (-3, 1]
1162
+ [Model_answer] : Range: \\((-4, 1]\\)
1163
+ Judgement: 0
1164
+
1165
+ [Question]: As shown in the figure, circle O has a radius 1.0, if angle BAC = 60.0, then the length of BC is ()\nChoices:\nA:2\nB:2\u221a{{3}}\nC:\u221a{{3}}\nD:2\u221a{{2}}
1166
+ [Standard Answer]: C
1167
+ [Model_answer] : null
1168
+ Judgement: 0
1169
+
1170
+ [Question]: Given the graph of the ellipse that intersects with x-axis at 9 and -9 and with y-axis at 3 and -3, determine its equation.A. \\frac{{x^2}}{{81}} + \\frac{{y^2}}{{9}} = 1 B. Can not determine.\n
1171
+ [Standard Answer]: A
1172
+ [Model_answer] : \\frac{{x^2}}{{81}} + \\frac{{y^2}}{{9}} = 1
1173
+ Judgement: 1
1174
+
1175
+ [Question]: {question}
1176
+ [Standard Answer]: {gt}
1177
+ [Model_answer] : {extraction}
1178
+ Judgement: """
1179
+
1180
+ def create_extract_prompt(demo_prompt, response, inst):
1181
+ demo_prompt = demo_prompt.strip()
1182
+ test_prompt = f"Model response: '{response}'\nExtracted Answer: "
1183
+ full_prompt = f"{demo_prompt}\n\n{test_prompt}"
1184
+ return full_prompt
1185
+
1186
+ def create_scoring_prompt(demo_prompt, inst):
1187
+ demo_prompt = demo_prompt.strip()
1188
+ full_prompt = demo_prompt.format(question = inst['question'], gt=inst['answer'], extraction=inst['extraction'])
1189
+ return full_prompt
1190
+
1191
+
1192
+ def extract_answer(response, inst, api_key):
1193
+ # general extraction
1194
+ try:
1195
+ full_prompt = create_extract_prompt(demo_prompt_extract, response, inst)
1196
+ extraction = get_chat_response(full_prompt, api_key)
1197
+ return extraction
1198
+ except Exception as e:
1199
+ print(e)
1200
+ print(f"Error in extracting answer for {response}")
1201
+ return ""
1202
+
1203
+ def match_answer(inst, api_key):
1204
+ try:
1205
+ full_prompt = create_scoring_prompt(demo_prompt_score, inst)
1206
+ extraction = get_chat_response(full_prompt, api_key)
1207
+ return extraction.replace("Judgement:", "").strip()
1208
+ except Exception as e:
1209
+ print(e)
1210
+ print(f"Error in matching answer")
1211
+
1212
+ return ""
1213
+
1214
+ save_results = []
1215
+ score_dict = defaultdict(lambda: defaultdict(list))
1216
+ score_version_dict = defaultdict(list)
1217
+
1218
+ for i, inst in enumerate(tqdm(results)):
1219
+ response = inst['model_answer']
1220
+ extraction = extract_answer(response, inst, OPENAI_KEY)
1221
+ inst['extraction'] = extraction.replace('Extracted Answer: ', '').strip()
1222
+
1223
+ judgement = match_answer(inst, OPENAI_KEY)
1224
+ while True:
1225
+ if judgement.strip() not in ['0', '1']:
1226
+ print('Wrong return format: ', judgement)
1227
+ judgement = match_answer(inst, OPENAI_KEY)
1228
+ else:
1229
+ inst['judgement'] = int(judgement)
1230
+ break
1231
+
1232
+ save_results.append(inst)
1233
+
1234
+ score_dict[inst['metadata']['subject']][inst['metadata']['subfield']].append(inst['judgement'])
1235
+ score_version_dict[inst['problem_version']].append(inst['judgement'])
1236
+
1237
+ results_file = os.path.join(output_dir, extract_file)
1238
+ with open(results_file, 'w') as f:
1239
+ json.dump(save_results, f, indent=4)
1240
+
1241
+ print(f"Save MathVerse Results at {results_file}")
1242
+
1243
+ save_json = {}
1244
+ # version level acc
1245
+ total_cnt, right_cnt = 0, 0
1246
+ for version in score_version_dict:
1247
+ version_total_cnt = len(score_version_dict[version])
1248
+ version_right_cnt = len([inst for inst in score_version_dict[version] if inst == 1])
1249
+ total_cnt += version_total_cnt
1250
+ right_cnt += version_right_cnt
1251
+ print(f"{version} Acc: {(version_right_cnt/version_total_cnt):.3f}")
1252
+ save_json[version] = f"{(version_right_cnt/version_total_cnt):.3f}"
1253
+
1254
+ print(f"Acc: {(right_cnt/total_cnt):.3f}")
1255
+
1256
+ save_json["Total Acc"] = f"{(right_cnt/total_cnt):.3f}"
1257
+
1258
+ scores_file = os.path.join(output_dir, score_file)
1259
+ with open(scores_file, 'w') as f:
1260
+ json.dump(save_json, f, indent=4)
1261
+
1262
+
1263
+ def eval_mmstar(eval_file, output_dir, score_file):
1264
+ MMStar_score_l2 = {
1265
+ 'coarse perception': {
1266
+ 'image scene and topic': 0,
1267
+ 'image style & quality': 0,
1268
+ 'image emotion': 0
1269
+ },
1270
+ 'fine-grained perception': {
1271
+ 'object counting': 0,
1272
+ 'recognition': 0,
1273
+ 'localization': 0
1274
+ },
1275
+ 'instance reasoning': {
1276
+ 'single-instance reasoning': 0,
1277
+ 'cross-instance attribute reasoning': 0,
1278
+ 'cross-instance relation reasoning': 0
1279
+ },
1280
+ 'logical reasoning': {
1281
+ 'code & sequence reasoning': 0,
1282
+ 'diagram reasoning': 0,
1283
+ 'common reasoning': 0
1284
+ },
1285
+ 'science & technology': {
1286
+ 'biology & chemistry & physics': 0,
1287
+ 'electronics & energy & mechanical eng.': 0,
1288
+ 'geography & earth science & agriculture': 0
1289
+ },
1290
+ 'math': {
1291
+ 'geometry': 0,
1292
+ 'numeric commonsense and calculation': 0,
1293
+ 'statistical reasoning': 0
1294
+ },
1295
+ }
1296
+ MMStar_counter = deepcopy(MMStar_score_l2)
1297
+
1298
+ data = eval_file
1299
+ lt = len(data)
1300
+ lines = [data.iloc[i] for i in range(lt)]
1301
+ for i in tqdm(range(len(lines))):
1302
+ line = lines[i]
1303
+ predict = str(line['prediction'])
1304
+ answers = str(line['answer'])
1305
+ # ori_bench = str(line['bench'])
1306
+ category = str(line['category'])
1307
+ l2_category = str(line['l2_category'])
1308
+ MMStar_counter[category][l2_category] += 1
1309
+
1310
+ answer = answers.lower().strip().replace('\n', ' ')
1311
+ predict = predict.lower().strip().replace('\n', ' ')
1312
+ # if ori_bench == 'MathVista' and answer not in ['a', 'b', 'c', 'd']:
1313
+ # if answer in predict:
1314
+ # MMStar_score_l2[category][l2_category] += 1
1315
+ # else:
1316
+ try:
1317
+ if answer == predict[0]:
1318
+ MMStar_score_l2[category][l2_category] += 1
1319
+ elif predict[0] == '(' and answer == predict[1]:
1320
+ MMStar_score_l2[category][l2_category] += 1
1321
+ elif predict[0:7] == 'option ' and answer == predict[7]:
1322
+ MMStar_score_l2[category][l2_category] += 1
1323
+ elif predict[0:14] == 'the answer is ' and answer == predict[14]:
1324
+ MMStar_score_l2[category][l2_category] += 1
1325
+ except Exception as e:
1326
+ pass
1327
+
1328
+ MMStar_score = {}
1329
+ MMStar_score['final score'] = 0
1330
+ for k, v in MMStar_score_l2.items():
1331
+ MMStar_score[k] = 0
1332
+ for l2_k, l2_v in v.items():
1333
+ if float(MMStar_counter[k][l2_k]) == 0:
1334
+ MMStar_score[f'{k}({l2_k})'] = 0
1335
+ else:
1336
+ MMStar_score[f'{k}({l2_k})'] = float(l2_v) / \
1337
+ float(MMStar_counter[k][l2_k])
1338
+ MMStar_score[k] += l2_v
1339
+ MMStar_score['final score'] += MMStar_score[k]
1340
+ MMStar_score[k] = float(MMStar_score[k]) / 250.0
1341
+ MMStar_score['final score'] = float(MMStar_score['final score']) / 1500.0
1342
+
1343
+ score_pth = os.path.join(output_dir, score_file)
1344
+ with open(score_pth, 'w') as f:
1345
+ json.dump(MMStar_score, f, indent=4)
1346
+
1347
+ print(
1348
+ f'MMStar_eval successfully finished evaluating {eval_file}, results saved in {score_pth}')
1349
+ print('Score: ')
1350
+ for key, value in MMStar_score.items():
1351
+ print('{}:{}'.format(key, value))
loader/create_eval_dataset.py ADDED
@@ -0,0 +1,410 @@
1
+ import os
2
+ import json
3
+ import math
4
+ import glob
5
+ from config import *
6
+ from PIL import Image
7
+ import pandas as pd
8
+ import pyarrow.parquet as pq
9
+ import torch.nn.functional as F
10
+ from eval.utils import *
11
+ from torch.utils.data import Dataset
12
+ from torchvision.transforms.functional import pil_to_tensor
13
+
14
+ class CreateEvalDataset(Dataset):
15
+ def __init__(self):
16
+ super(CreateEvalDataset, self).__init__()
17
+
18
+ """
19
+ Eval Datasets
20
+
21
+ - VQAv2
22
+ - GQA
23
+ - SQA-IMG
24
+ - VizWiz
25
+ - TextVQA
26
+ - POPE
27
+ - MME
28
+ - MMBench
29
+ - MMBench-CN
30
+ - QBench
31
+ - MM-Vet
32
+ - MMMU
33
+ - MathVista
34
+ - AI2D
35
+ - HallusionBench
36
+ - ChartQA
37
+ - SEED
38
+ - LLaVA Wild
39
+ - BLINK
40
+ - MathVerse
41
+
42
+ """
43
+
44
+ # dataset root path
45
+ self.dataset_root_path = DATASET_ROOT
46
+
47
+ # load test data
48
+ pre_vqav2 = json.load(open(os.path.join(DATASET_ROOT, VQAV2)))
49
+ pre_gqa = json.load(open(os.path.join(DATASET_ROOT, GQA)))
50
+ pre_sqa = json.load(open(os.path.join(DATASET_ROOT, SQA)))
51
+ pre_sqa_split = json.load(open(os.path.join(DATASET_ROOT, SQA_SPLIT)))
52
+ pre_vizwiz = json.load(open(os.path.join(DATASET_ROOT, VIZWIZ)))
53
+ pre_textvqa = json.load(open(os.path.join(DATASET_ROOT, TEXTVQA)))
54
+ pre_textvqa_annotations = json.load(open(os.path.join(DATASET_ROOT, TEXTVQA_ANNOTATIONS)))
55
+ pre_pope_popular = pd.read_json(os.path.join(DATASET_ROOT, POPE_POPULAR), lines=True)
56
+ pre_pope_adversarial= pd.read_json(os.path.join(DATASET_ROOT, POPE_ADVERSARIAL), lines=True)
57
+ pre_pope_random = pd.read_json(os.path.join(DATASET_ROOT, POPE_RANDOM), lines=True)
58
+ pre_mme = json.load(open(os.path.join(DATASET_ROOT, MME)))
59
+ pre_mmbench = pd.read_table(os.path.join(DATASET_ROOT, MMBENCH))
60
+ pre_mmbench_dev = pd.read_table(os.path.join(DATASET_ROOT, MMBENCH_DEV))
61
+ pre_mmbench_cn = pd.read_table(os.path.join(DATASET_ROOT, MMBENCH_CN))
62
+ pre_mmbench_cn_dev = pd.read_table(os.path.join(DATASET_ROOT, MMBENCH_CN_DEV))
63
+ pre_qbench = json.load(open(os.path.join(DATASET_ROOT, QBENCH)))
64
+ pre_qbench_cn = json.load(open(os.path.join(DATASET_ROOT, QBENCH_CN)))
65
+ pre_mmvet = json.load(open(os.path.join(DATASET_ROOT, MMVET)))
66
+ mmmu_files = glob.glob(os.path.join(DATASET_ROOT, MMMU))
67
+ pre_mmmu = [pq.read_pandas(os.path.join(DATASET_ROOT, mf)).to_pandas() for mf in mmmu_files]
68
+ pre_mathvista1 = pq.read_pandas(os.path.join(DATASET_ROOT, MATHVISTA)).to_pandas()
69
+ pre_ai2d = json.load(open(os.path.join(DATASET_ROOT, AI2D)))
70
+ pre_hallusionbench = json.load(open(os.path.join(DATASET_ROOT, HALLUSIONBENCH)))
71
+ pre_chartqa = json.load(open(os.path.join(DATASET_ROOT, CHARTQA)))
72
+ pre_seed = json.load(open(os.path.join(DATASET_ROOT, SEED)))
73
+ pre_llava = pd.read_json(os.path.join(DATASET_ROOT, LLAVA), lines=True)
74
+ # pre_blink = json.load(open(os.path.join(DATASET_ROOT, BLINK)))
75
+ pre_mathverse = json.load(open(os.path.join(DATASET_ROOT, MATHVERSE)))
76
+ pre_mathverse_text_only = json.load(open(os.path.join(DATASET_ROOT, MATHVERSE_TEXT_ONLY)))
77
+ pre_mmstar = pq.read_pandas(os.path.join(DATASET_ROOT, MMSTAR)).to_pandas()
78
+
79
+ # data filtering
80
+ vqav2 = self.vqav2_filtering(pre_vqav2)
81
+ gqa = self.gqa_filtering(pre_gqa)
82
+ sqa = self.sqa_filtering(pre_sqa, pre_sqa_split)
83
+ vizwiz = self.vizwiz_filtering(pre_vizwiz)
84
+ textvqa = self.textvqa_filtering(pre_textvqa, pre_textvqa_annotations)
85
+ pope = self.pope_filtering([pre_pope_popular, pre_pope_adversarial, pre_pope_random])
86
+ mme = self.mme_filtering(pre_mme)
87
+ mmbench = self.mmbench_filtering(pre_mmbench)
88
+ mmbench_dev = self.mmbench_filtering(pre_mmbench_dev)
89
+ mmbench_cn = self.mmbench_filtering(pre_mmbench_cn)
90
+ mmbench_cn_dev = self.mmbench_filtering(pre_mmbench_cn_dev)
91
+ qbench = self.qbench_filtering(pre_qbench)
92
+ qbench_cn = self.qbench_filtering(pre_qbench_cn)
93
+ mmvet = self.mmvet_filtering(pre_mmvet)
94
+ mmmu = self.mmmu_filtering(pre_mmmu)
95
+ mathvista = self.mathvista_filtering(pre_mathvista1)
96
+ ai2d = self.ai2d_filtering(pre_ai2d)
97
+ hallusionbench = self.hallusionbench_filtering(pre_hallusionbench)
98
+ chartqa = self.chartqa_filtering(pre_chartqa)
99
+ seed = self.seed_filtering(pre_seed)
100
+ llava = self.llava_filtering(pre_llava)
101
+ # blink = self.blink_filtering(pre_blink)
102
+ mathverse = self.mathverse_filtering(pre_mathverse, pre_mathverse_text_only)
103
+ mmstar = self.mmstar_filtering(pre_mmstar)
104
+
105
+ # merging
106
+ self.data = {
107
+ 'vqav2': vqav2,
108
+ 'gqa': gqa,
109
+ 'sqa':sqa,
110
+ 'vizwiz': vizwiz,
111
+ 'textvqa': textvqa,
112
+ 'pope': pope,
113
+ 'mme': mme,
114
+ 'mmbench': mmbench,
115
+ 'mmbench_dev': mmbench_dev,
116
+ 'mmbench_cn': mmbench_cn,
117
+ 'mmbench_cn_dev': mmbench_cn_dev,
118
+ 'qbench': qbench,
119
+ 'mm-vet': mmvet,
120
+ 'mmmu': mmmu,
121
+ 'mathvista': mathvista,
122
+ 'ai2d': ai2d,
123
+ 'hallusionbench': hallusionbench,
124
+ 'chartqa': chartqa,
125
+ 'seed': seed,
126
+ 'llava': llava,
127
+ # 'blink': blink,
128
+ 'mathverse' : mathverse,
129
+ 'mmstar' : mmstar
130
+ }
131
+
132
+
133
+ def vqav2_filtering(self, pre_data):
134
+ data = []
135
+ for x in pre_data['questions']:
136
+ data.append({'image': f"VQAv2/test2015/COCO_test2015_{x['image_id']:012d}.jpg",
137
+ 'question': x['question'],
138
+ 'id': x['question_id']})
139
+ return data
140
+
141
+ def gqa_filtering(self, pre_data):
142
+ data = []
143
+ for qid, x in pre_data.items():
144
+ data.append({'image': f"gqa/images/{x['imageId']}.jpg",
145
+ 'question': x['question'],
146
+ 'id': qid})
147
+ return data
148
+
149
+ def sqa_filtering(self, pre_data, pre_sqa_split):
150
+ data = []
151
+ questions = {idx: pre_data[idx] for idx in pre_sqa_split['test']}
152
+ for qid, x in questions.items():
153
+ if x['image'] is not None:
154
+ choices = '\n'.join(f"{chr(ord('A') + i)}. {choice}" for i, choice in enumerate(x['choices']))
155
+ question = '\n'.join([x['hint'], x['question'], choices])
156
+ data.append({'image': f"ScienceQA/images/test/{qid}/image.png",
157
+ 'question': question,
158
+ 'id': qid,
159
+ 'candidates': x['choices'],
160
+ 'gt': x['answer']})
161
+ return data
162
+
163
+ def vizwiz_filtering(self, pre_data):
164
+ data = []
165
+ for qid, x in enumerate(pre_data):
166
+ data.append({'image': f"VizWiz/test/{x['image']}",
167
+ 'question': x['question'],
168
+ 'id': qid})
169
+ return data
170
+
171
+ def textvqa_filtering(self, pre_data, annotations):
172
+ data = []
173
+ for x, answer in zip(pre_data, annotations['data']):
174
+ data.append({'image': f"TextVQA/train_images/{x['image']}",
175
+ 'question': x['text'],
176
+ 'id': x['question_id'],
177
+ 'gt': answer['answers']})
178
+ return data
179
+
180
+ def pope_filtering(self, pre_data):
181
+ data = []
182
+ categories = ['adversarial', 'popular', 'random']
183
+ for category, split in zip(categories, pre_data):
184
+ for _, x in split.iterrows():
185
+ data.append({'image': f"coco2014/val2014/{x['image']}",
186
+ 'question': x['text'],
187
+ 'id': x['question_id'],
188
+ 'category': category})
189
+ return data
190
+
191
+ def mme_filtering(self, pre_data):
192
+ data = []
193
+ for x in pre_data:
194
+ data.append({'image': f"MME_Benchmark_release_version/{x['image']}",
195
+ 'question': x['text'],
196
+ 'id': x['question_id'],
197
+ 'category': x['category']})
198
+ return data
199
+
200
+ def mmbench_filtering(self, pre_data):
201
+ data = []
202
+ for _, x in pre_data.iterrows():
203
+ options = ['A', 'B', 'C', 'D']
204
+ choice_list = [choice for choice in options if not self.is_none(x[choice])]
205
+ choices = '\n'.join(f"{chr(ord('A') + i)}. {x[choice]}" for i, choice in enumerate(choice_list))
206
+ question = '\n'.join([x['question'], choices])
207
+
208
+ if not self.is_none(x['hint']):
209
+ question = '\n'.join([x['hint'], question])
210
+
211
+ data.append({'image': x['image'],
212
+ 'question': question,
213
+ 'id': x['index'],
214
+ 'answer': x['answer'] if 'answer' in x else None})
215
+ return data
216
+
217
+ def qbench_filtering(self, pre_data):
218
+ data = []
219
+ for qid, x in enumerate(pre_data):
220
+ choices = '\n'.join(f"{chr(ord('A') + i)}. {choice}" for i, choice in enumerate(x['candidates']))
221
+ question = '\n'.join([x['question'], choices])
222
+ data.append({'image': f"LLVisionQA-QBench/images/{x['img_path']}",
223
+ 'question': question,
224
+ 'id': qid,
225
+ 'candidates': x['candidates'],
226
+ 'gt': x['correct_ans']})
227
+ return data
228
+
229
+ def mmvet_filtering(self, pre_data):
230
+ data = []
231
+ for qid, x in pre_data.items():
232
+ data.append({'image': f"mm-vet/images/{x['imagename']}",
233
+ 'question': x['question'],
234
+ 'id': qid,
235
+ 'gt': x['answer'],
236
+ 'capability': x['capability']})
237
+ return data
238
+
239
+ def mmmu_filtering(self, pre_data):
240
+ data = []
241
+ for split in pre_data:
242
+ for _, x in split.iterrows():
243
+ index2ans, all_choices = self.get_multi_choice_info(eval(x['options']))
244
+ choices = ' '.join([f"{k}. {v}" for k,v in index2ans.items()])
245
+ question = '\n'.join([x['question'], choices])
246
+ num_images = count_unique_image_tokens(question)
247
+ data.append({'images': [x[f"image_{i+1}"]['bytes'] for i in range(num_images)],
248
+ 'question': replace_image_tokens(question),
249
+ 'id': x['id'],
250
+ 'question_type': x['question_type'],
251
+ 'gt': x['answer'],
252
+ 'index2ans': index2ans,
253
+ 'all_choices': all_choices})
254
+ return data
255
+
256
+ def mathvista_filtering(self, pre_data):
257
+ data = []
258
+ for _, x in pre_data.iterrows():
259
+ skills = x['metadata']['skills'].tolist()
260
+ x['metadata']['skills'] = skills
261
+ choices = x['choices'].tolist() if x['choices'] is not None else None
262
+ data.append({'image': f"MathVista/{x['image']}",
263
+ 'question': x['query'],
264
+ 'question_type': x['question_type'],
265
+ 'answer': x['answer'],
266
+ 'answer_type': x['answer_type'],
267
+ 'choices': choices,
268
+ 'metadata': x['metadata'],
269
+ 'precision': x['precision'],
270
+ 'id': x['pid']})
271
+ return data
272
+
273
+ def ai2d_filtering(self, pre_data):
274
+ data = []
275
+ for x in pre_data:
276
+ choices = ' '.join(f"{chr(ord('A') + i)}. {choice}" for i, choice in enumerate(x["metadata"]["answerTexts"]))
277
+ question = '\n'.join([x['question'], choices])
278
+ image = f"ai2d/abc_images/{x['imageName']}" if x['metadata']['abcLabel'] else f"ai2d/images/{x['imageName']}"
279
+ data.append({'image': image,
280
+ 'question': question,
281
+ 'id': x['metadata']['questionId'],
282
+ 'gt': x['metadata']['correctAnswer']})
283
+ return data
284
+
285
+ def hallusionbench_filtering(self, pre_data):
286
+ data = []
287
+ for qid, x in enumerate(pre_data):
288
+ if x['filename'] is None:
289
+ img_path = ""
290
+ question = x['question']
291
+ else:
292
+ img_path = f"HallusionBench/hallusion_bench/{x['filename'][2:]}".format()
293
+ question = "<image>" + x['question']
294
+ data.append({'image': img_path,
295
+ 'question': question,
296
+ 'id': qid,
297
+ 'gt': x['gt_answer']})
298
+ return data
299
+
300
+ def chartqa_filtering(self, pre_data):
301
+ data = []
302
+ for qid, x in enumerate(pre_data):
303
+ data.append({'image': f"chartqa/test/png/{x['imgname']}",
304
+ 'question': x['query'],
305
+ 'id': x['imgname'],
306
+ 'gt': x['label']})
307
+ return data
308
+
309
+ def seed_filtering(self, pre_data):
310
+ data = []
311
+ for x in pre_data['questions']:
312
+ if x['data_type'] != 'image':
313
+ continue
314
+ choice_list = [key for key in x.keys() if 'choice' in key]
315
+ choices = '\n'.join(f"{chr(ord('A') + i)}. {x[choice]}" for i, choice in enumerate(choice_list))
316
+ question = '\n'.join([x['question'], choices])
317
+ data.append({'image': f"SEED-Bench/SEED-Bench-image/{x['data_id']}",
318
+ 'question': question,
319
+ 'id': x['question_id'],
320
+ 'question_type': x['question_type_id'],
321
+ 'gt': x['answer']})
322
+ return data
323
+
324
+ def llava_filtering(self, pre_data):
325
+ data = []
326
+ for _, x in pre_data.iterrows():
327
+ data.append({'image': f"llava-bench-in-the-wild/images/{x['image']}",
328
+ 'question': x['text'],
329
+ 'id': x['question_id'],
330
+ "category": x['category']})
331
+ return data
332
+
333
+ def blink_filtering(self, pre_data):
334
+ data = []
335
+ # TODO
336
+ return data
337
+
338
+ def mathverse_filtering(self, pre_data, pre_data_text_only):
339
+ data = []
340
+ for x in pre_data:
341
+ data.append({'image': f"MathVerse/images/{x['image']}",
342
+ 'question': "<image>" + x['query_wo'],
343
+ # 'question': "<image>" + x['query_cot'],
344
+ 'id': x['sample_index'],
345
+ 'problem_index': x['problem_index'],
346
+ 'problem_version': x['problem_version'],
347
+ 'gt' : x['answer'],
348
+ 'question_type': x['question_type'],
349
+ 'metadata' : x['metadata'],
350
+ 'query_cot' : x['query_cot'],
351
+ 'origin_question': x['question']
352
+ })
353
+ offset = len(pre_data)
354
+ for x in pre_data_text_only:
355
+ data.append({'image': "",
356
+ 'question': x['query_wo'],
357
+ # 'question': x['query_cot'],
358
+ 'id': str(int(x['sample_index']) + offset),
359
+ 'problem_index': x['problem_index'],
360
+ 'problem_version': x['problem_version'],
361
+ 'gt' : x['answer'],
362
+ 'question_type': x['question_type'],
363
+ 'metadata' : x['metadata'],
364
+ 'query_cot' : x['query_cot'],
365
+ 'origin_question': x['question']
366
+ })
367
+
368
+ return data
369
+
370
+ def is_none(self, value):
371
+ return type(value) is float and math.isnan(value)
372
+
373
+ def get_options(self, row, options):
374
+ parsed_options = []
375
+ for option in options:
376
+ option_value = row[option]
377
+ if self.is_none(option_value):
378
+ break
379
+ parsed_options.append(option_value)
380
+ return parsed_options
381
+
382
+ def __len__(self):
383
+ return len(self.data)
384
+
385
+ def get_multi_choice_info(self, options):
386
+ """
387
+ Given the list of options for multiple choice question
388
+ Return the index2ans and all_choices
389
+ """
390
+
391
+ start_chr = 'A'
392
+ all_choices = []
393
+ index2ans = {}
394
+ for i, option in enumerate(options):
395
+ index2ans[chr(ord(start_chr) + i)] = option
396
+ all_choices.append(chr(ord(start_chr) + i))
397
+
398
+ return index2ans, all_choices
399
+
400
+ def mmstar_filtering(self, pre_data):
401
+ data = []
402
+ for _, x in pre_data.iterrows():
403
+ data.append({'id' : x['index'],
404
+ 'question': x['question'],
405
+ 'answer': x['answer'],
406
+ 'category': x['category'],
407
+ 'l2_category': x['l2_category'],
408
+ # 'bench': x['bench'],
409
+ 'image': x['image']})
410
+ return data
meteor/arch/build_module.py ADDED
@@ -0,0 +1,209 @@
1
+ import re
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+ from transformers import CLIPVisionModel
6
+
7
+
8
+ def build_vision_tower():
9
+ vision_tower = 'openai/clip-vit-large-patch14-336'
10
+ return CLIPVisionTower(vision_tower)
11
+
12
+
13
+ def build_vision_projector():
14
+ projector_type = 'mlp2x_gelu'
15
+ mm_hidden_size = 1024
16
+ hidden_size = 4096
17
+
18
+ mlp_gelu_match = re.match(r'^mlp(\d+)x_gelu$', projector_type)
19
+ if mlp_gelu_match:
20
+ mlp_depth = int(mlp_gelu_match.group(1))
21
+ modules = [nn.Linear(mm_hidden_size, hidden_size)]
22
+ for _ in range(1, mlp_depth):
23
+ modules.append(nn.GELU())
24
+ modules.append(nn.Linear(hidden_size, hidden_size))
25
+ return nn.Sequential(*modules)
26
+
27
+ if projector_type == 'identity':
28
+ return IdentityMap()
29
+
30
+ raise ValueError(f'Unknown projector type: {projector_type}')
31
+
32
+
33
+ class IdentityMap(nn.Module):
34
+
35
+ def __init__(self):
36
+ super().__init__()
37
+
38
+ def forward(self, x, *args, **kwargs):
39
+ return x
40
+
41
+ @property
42
+ def config(self):
43
+ return {'mm_projector_type': 'identity'}
44
+
45
+
46
+ class CLIPVisionTower(nn.Module):
47
+
48
+ def __init__(self, vision_tower):
49
+ super().__init__()
50
+
51
+ self.is_loaded = False
52
+ self.is_resize_pos = False
53
+
54
+ self.vision_tower_name = vision_tower
55
+ self.select_layer = -1
56
+ self.select_feature = 'patch'
57
+ self.load_model()
58
+ self.resize_pos()
59
+
60
+ def load_model(self):
61
+ self.vision_tower = CLIPVisionModel.from_pretrained(
62
+ self.vision_tower_name)
63
+ self.vision_tower.requires_grad_(False)
64
+
65
+ self.is_loaded = True
66
+
67
+ def resize_pos(self):
68
+ pos_embed_checkpoint = self.vision_tower.vision_model.embeddings.position_embedding.weight
69
+ pos_embed_checkpoint = pos_embed_checkpoint.unsqueeze(0)
70
+ orig_size = 24
71
+ new_size = 35
72
+
73
+ if pos_embed_checkpoint.shape[1] == new_size**2 + 1:
74
+ self.is_resize_pos = True
75
+ else:
76
+ embedding_size = pos_embed_checkpoint.shape[-1]
77
+ num_extra_tokens = 1
78
+ new_num = new_size**2 + num_extra_tokens
79
+ # print('Position interpolate from %dx%d to %dx%d' %
80
+ # (orig_size, orig_size, new_size, new_size))
81
+ extra_tokens = pos_embed_checkpoint[:, :num_extra_tokens]
82
+ # only the position tokens are interpolated
83
+ pos_tokens = pos_embed_checkpoint[:, num_extra_tokens:]
84
+ pos_tokens = pos_tokens.reshape(-1, orig_size, orig_size,
85
+ embedding_size).permute(
86
+ 0, 3, 1, 2)
87
+ pos_tokens = torch.nn.functional.interpolate(
88
+ pos_tokens,
89
+ size=(new_size, new_size),
90
+ mode='bicubic',
91
+ align_corners=False)
92
+ pos_tokens = pos_tokens.permute(0, 2, 3, 1).flatten(1, 2)
93
+ new_pos_embed = torch.cat((extra_tokens, pos_tokens), dim=1)
94
+
95
+ new_pos_embed = new_pos_embed.squeeze(0)
96
+
97
+ self.vision_tower.vision_model.embeddings.position_embedding = torch.nn.Embedding(
98
+ new_num, 1024)
99
+ self.vision_tower.vision_model.embeddings.position_embedding.weight = torch.nn.Parameter(
100
+ new_pos_embed.to(pos_embed_checkpoint.dtype))
101
+ self.vision_tower.vision_model.embeddings.position_ids = torch.arange(
102
+ new_num).expand((1, -1))
103
+
104
+ self.is_resize_pos = True
105
+
106
+ def feature_select(self, image_forward_outs):
107
+ image_features = image_forward_outs.hidden_states[self.select_layer]
108
+ if self.select_feature == 'patch':
109
+ image_features = image_features[:, 1:]
110
+ elif self.select_feature == 'cls_patch':
111
+ image_features = image_features
112
+ else:
113
+ raise ValueError(
114
+ f'Unexpected select feature: {self.select_feature}')
115
+ return image_features
116
+
117
+ def forward(self, images):
118
+ if not self.is_loaded:
119
+ self.load_model()
120
+ if type(images) is list:
121
+ image_features = []
122
+ for image in images:
123
+ image_forward_out = self.vision_tower(
124
+ image.to(device=self.device,
125
+ dtype=self.dtype).unsqueeze(0),
126
+ output_hidden_states=True)
127
+ image_feature = self.feature_select(image_forward_out).to(
128
+ image.dtype)
129
+ image_features.append(image_feature)
130
+ else:
131
+ image_forward_outs = self.vision_tower(
132
+ images.to(device=self.device, dtype=self.dtype),
133
+ output_hidden_states=True)
134
+ image_features = self.feature_select(image_forward_outs).to(
135
+ images.dtype)
136
+
137
+ return image_features
138
+
139
+ @property
140
+ def dummy_feature(self):
141
+ return torch.zeros(
142
+ 1, self.hidden_size, device=self.device, dtype=self.dtype)
143
+
144
+ @property
145
+ def dtype(self):
146
+ return self.vision_tower.dtype
147
+
148
+ @property
149
+ def device(self):
150
+ return self.vision_tower.device
151
+
152
+ @property
153
+ def config(self):
154
+ if self.is_loaded:
155
+ return self.vision_tower.config
156
+ else:
157
+ return self.cfg_only
158
+
159
+ @property
160
+ def hidden_size(self):
161
+ return self.config.hidden_size
162
+
163
+ @property
164
+ def num_patches(self):
165
+ return (self.config.image_size // self.config.patch_size)**2
166
+
167
+
168
+ class LoRA(nn.Module):
169
+
170
+ def __init__(self,
171
+ in_features: int,
172
+ out_features: int,
173
+ bias: bool = True,
174
+ device=None,
175
+ dtype=None,
176
+ lora_r=8,
177
+ lora_alpha=16,
178
+ lora_dropout=0.05,
179
+ lora_len=0,
180
+ **kwargs) -> None:
181
+ super().__init__()
182
+ self.lora_r = lora_r
183
+ self.lora_alpha = lora_alpha
184
+ self.lora_len = lora_len
185
+ if lora_dropout > 0.:
186
+ self.lora_dropout = nn.Dropout(p=lora_dropout)
187
+ else:
188
+ self.lora_dropout = lambda x: x
189
+ self.lora_scaling = self.lora_alpha / self.lora_r
190
+
191
+ self.lora_A = nn.Linear(
192
+ in_features, self.lora_r, bias=False, device=device, dtype=dtype)
193
+ self.lora_B = nn.Linear(
194
+ self.lora_r, out_features, bias=False, device=device, dtype=dtype)
195
+ self.ffn = nn.Linear(in_features, out_features, bias=bias, device=device, dtype=dtype)
196
+
197
+ def forward(self, x, im_mask=None):
198
+ res = self.ffn(x)
199
+ if im_mask is not None:
200
+ if torch.sum(im_mask) > 0:
201
+ part_x = x[im_mask]
202
+ res[im_mask] += self.lora_B(
203
+ self.lora_A(
204
+ self.lora_dropout(part_x))) * self.lora_scaling
205
+ else:
206
+ part_x = x[:, :1]
207
+ res[:, :1] += self.lora_B(
208
+ self.lora_A(self.lora_dropout(part_x))) * 0
209
+ return res
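A minimal usage sketch of the LoRA wrapper (editor's example; the shapes and the image-token mask are hypothetical). It behaves like a plain nn.Linear, adding the scaled low-rank update only at positions selected by im_mask:

import torch

layer = LoRA(in_features=4096, out_features=4096, lora_r=8, lora_alpha=16)
x = torch.randn(2, 10, 4096)
im_mask = torch.zeros(2, 10, dtype=torch.bool)
im_mask[:, :4] = True                  # pretend the first 4 tokens are image tokens
out = layer(x, im_mask=im_mask)        # low-rank delta added only where im_mask is True
print(out.shape)                       # torch.Size([2, 10, 4096])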
meteor/arch/configuration_internlm2.py ADDED
@@ -0,0 +1,166 @@
1
+ # Copyright (c) InternLM. All rights reserved.
2
+ #
3
+ # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
4
+ # and OPT implementations in this library. It has been modified from its
5
+ # original forms to accommodate minor architectural differences compared
6
+ # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
7
+ #
8
+ # Licensed under the Apache License, Version 2.0 (the "License");
9
+ # you may not use this file except in compliance with the License.
10
+ # You may obtain a copy of the License at
11
+ #
12
+ # http://www.apache.org/licenses/LICENSE-2.0
13
+ #
14
+ # Unless required by applicable law or agreed to in writing, software
15
+ # distributed under the License is distributed on an "AS IS" BASIS,
16
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17
+ # See the License for the specific language governing permissions and
18
+ # limitations under the License.
19
+ """InternLM model configuration."""
20
+
21
+ from transformers.configuration_utils import PretrainedConfig
22
+ from transformers.utils import logging
23
+
24
+ logger = logging.get_logger(__name__)
25
+
26
+ INTERNLM_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
27
+
28
+
29
+ class InternLM2Config(PretrainedConfig):
30
+ r"""
31
+ This is the configuration class to store the configuration of a [`InternLMModel`]. It is used to instantiate
32
+ an InternLM model according to the specified arguments, defining the model architecture. Instantiating a
33
+ configuration with the defaults will yield a similar configuration to that of the InternLM-7B.
34
+
35
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
36
+ documentation from [`PretrainedConfig`] for more information.
37
+
38
+
39
+ Args:
40
+ vocab_size (`int`, *optional*, defaults to 32000):
41
+ Vocabulary size of the InternLM model. Defines the number of different tokens that can be represented by the
42
+ `inputs_ids` passed when calling [`InternLMModel`]
43
+ hidden_size (`int`, *optional*, defaults to 4096):
44
+ Dimension of the hidden representations.
45
+ intermediate_size (`int`, *optional*, defaults to 11008):
46
+ Dimension of the MLP representations.
47
+ num_hidden_layers (`int`, *optional*, defaults to 32):
48
+ Number of hidden layers in the Transformer encoder.
49
+ num_attention_heads (`int`, *optional*, defaults to 32):
50
+ Number of attention heads for each attention layer in the Transformer encoder.
51
+ num_key_value_heads (`int`, *optional*):
52
+ This is the number of key_value heads that should be used to implement Grouped Query Attention. If
53
+ `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
54
+ `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
55
+ converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
56
+ by meanpooling all the original heads within that group. For more details checkout [this
57
+ paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
58
+ `num_attention_heads`.
59
+ hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
60
+ The non-linear activation function (function or string) in the decoder.
61
+ max_position_embeddings (`int`, *optional*, defaults to 2048):
62
+ The maximum sequence length that this model might ever be used with. Typically set this to something large
63
+ just in case (e.g., 512 or 1024 or 2048).
64
+ initializer_range (`float`, *optional*, defaults to 0.02):
65
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
66
+ rms_norm_eps (`float`, *optional*, defaults to 1e-6):
67
+ The epsilon used by the rms normalization layers.
68
+ use_cache (`bool`, *optional*, defaults to `True`):
69
+ Whether or not the model should return the last key/values attentions (not used by all models). Only
70
+ relevant if `config.is_decoder=True`.
71
+ tie_word_embeddings(`bool`, *optional*, defaults to `False`):
72
+ Whether to tie weight embeddings
73
+ Example:
74
+
75
+ ```python
76
+ >>> from transformers import InternLMModel, InternLMConfig
77
+
78
+ >>> # Initializing a InternLM internlm-7b style configuration
79
+ >>> configuration = InternLMConfig()
80
+
81
+ >>> # Initializing a model from the internlm-7b style configuration
82
+ >>> model = InternLMModel(configuration)
83
+
84
+ >>> # Accessing the model configuration
85
+ >>> configuration = model.config
86
+ ```"""
87
+ model_type = 'internlm'
88
+ _auto_class = 'AutoConfig'
89
+
90
+ def __init__( # pylint: disable=W0102
91
+ self,
92
+ vocab_size=103168,
93
+ hidden_size=4096,
94
+ intermediate_size=11008,
95
+ num_hidden_layers=32,
96
+ num_attention_heads=32,
97
+ num_key_value_heads=None,
98
+ hidden_act='silu',
99
+ max_position_embeddings=2048,
100
+ initializer_range=0.02,
101
+ rms_norm_eps=1e-6,
102
+ use_cache=True,
103
+ pad_token_id=0,
104
+ bos_token_id=1,
105
+ eos_token_id=2,
106
+ tie_word_embeddings=False,
107
+ bias=True,
108
+ rope_theta=10000,
109
+ rope_scaling=None,
110
+ attn_implementation='eager',
111
+ **kwargs,
112
+ ):
113
+ self.vocab_size = vocab_size
114
+ self.max_position_embeddings = max_position_embeddings
115
+ self.hidden_size = hidden_size
116
+ self.intermediate_size = intermediate_size
117
+ self.num_hidden_layers = num_hidden_layers
118
+ self.num_attention_heads = num_attention_heads
119
+ self.bias = bias
120
+
121
+ if num_key_value_heads is None:
122
+ num_key_value_heads = num_attention_heads
123
+ self.num_key_value_heads = num_key_value_heads
124
+
125
+ self.hidden_act = hidden_act
126
+ self.initializer_range = initializer_range
127
+ self.rms_norm_eps = rms_norm_eps
128
+ self.use_cache = use_cache
129
+ self.rope_theta = rope_theta
130
+ self.rope_scaling = rope_scaling
131
+ self._rope_scaling_validation()
132
+
133
+ self.attn_implementation = attn_implementation
134
+ if self.attn_implementation is None:
135
+ self.attn_implementation = 'eager'
136
+ super().__init__(
137
+ pad_token_id=pad_token_id,
138
+ bos_token_id=bos_token_id,
139
+ eos_token_id=eos_token_id,
140
+ tie_word_embeddings=tie_word_embeddings,
141
+ **kwargs,
142
+ )
143
+
144
+ def _rope_scaling_validation(self):
145
+ """Validate the `rope_scaling` configuration."""
146
+ if self.rope_scaling is None:
147
+ return
148
+
149
+ if not isinstance(self.rope_scaling,
150
+ dict) or len(self.rope_scaling) != 2:
151
+ raise ValueError(
152
+ '`rope_scaling` must be a dictionary with two fields, `type` and `factor`, '
153
+ f'got {self.rope_scaling}')
154
+ rope_scaling_type = self.rope_scaling.get('type', None)
155
+ rope_scaling_factor = self.rope_scaling.get('factor', None)
156
+ if rope_scaling_type is None or rope_scaling_type not in [
157
+ 'linear', 'dynamic'
158
+ ]:
159
+ raise ValueError(
160
+ f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}"
161
+ )
162
+ if rope_scaling_factor is None or not isinstance(
163
+ rope_scaling_factor, float) or rope_scaling_factor < 1.0:
164
+ raise ValueError(
165
+ f"`rope_scaling`'s factor field must be a float >= 1, got {rope_scaling_factor}"
166
+ )
meteor/arch/modeling_internlm2.py ADDED
@@ -0,0 +1,1080 @@
1
+ # # Copyright (c) InternLM. All rights reserved.
2
+ #
3
+ # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
4
+ # and OPT implementations in this library. It has been modified from its
5
+ # original forms to accommodate minor architectural differences compared
6
+ # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
7
+ #
8
+ # Licensed under the Apache License, Version 2.0 (the "License");
9
+ # you may not use this file except in compliance with the License.
10
+ # You may obtain a copy of the License at
11
+ #
12
+ # http://www.apache.org/licenses/LICENSE-2.0
13
+ #
14
+ # Unless required by applicable law or agreed to in writing, software
15
+ # distributed under the License is distributed on an "AS IS" BASIS,
16
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17
+ # See the License for the specific language governing permissions and
18
+ # limitations under the License.
19
+ """PyTorch InternLM2 model."""
20
+ import math
21
+ import warnings
22
+ from typing import List, Optional, Tuple, Union
23
+
24
+ import torch
25
+ import torch.nn.functional as F
26
+ import torch.utils.checkpoint
27
+ from einops import rearrange
28
+ from torch import nn
29
+ from transformers.activations import ACT2FN
30
+ from transformers.modeling_outputs import BaseModelOutputWithPast
31
+ from transformers.modeling_utils import PreTrainedModel
32
+ from transformers.utils import (add_start_docstrings,
33
+ add_start_docstrings_to_model_forward, logging)
34
+
35
+ try:
36
+ from transformers.generation.streamers import BaseStreamer
37
+ except: # noqa # pylint: disable=bare-except
38
+ BaseStreamer = None
39
+
40
+ from .build_module import LoRA
41
+ from .configuration_internlm2 import InternLM2Config
42
+
43
+ logger = logging.get_logger(__name__)
44
+
45
+ _CONFIG_FOR_DOC = 'InternLM2Config'
46
+ flash_attn_func, flash_attn_varlen_func = None, None
47
+ pad_input, index_first_axis, unpad_input = None, None, None
48
+ def _import_flash_attn():
49
+ global flash_attn_func, flash_attn_varlen_func
50
+ global pad_input, index_first_axis, unpad_input
51
+ try:
52
+ from flash_attn import flash_attn_func as _flash_attn_func, flash_attn_varlen_func as _flash_attn_varlen_func
53
+ from flash_attn.bert_padding import pad_input as _pad_input, index_first_axis as _index_first_axis, unpad_input as _unpad_input
54
+ flash_attn_func, flash_attn_varlen_func = _flash_attn_func, _flash_attn_varlen_func
55
+ pad_input, index_first_axis, unpad_input = _pad_input, _index_first_axis, _unpad_input
56
+ except ImportError:
57
+ raise ImportError("flash_attn is not installed.")
58
+
59
+ # Copied from transformers.models.llama.modeling_llama._get_unpad_data
60
+ def _get_unpad_data(attention_mask):
61
+ seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
62
+ indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
63
+ max_seqlen_in_batch = seqlens_in_batch.max().item()
64
+ cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.torch.int32), (1, 0))
65
+ return (
66
+ indices,
67
+ cu_seqlens,
68
+ max_seqlen_in_batch,
69
+ )
70
+
71
+ # Copied from transformers.models.bart.modeling_bart._make_causal_mask
72
+ def _make_causal_mask(input_ids_shape: torch.Size,
73
+ dtype: torch.dtype,
74
+ device: torch.device,
75
+ past_key_values_length: int = 0):
76
+ """Make causal mask used for bi-directional self-attention."""
77
+ bsz, tgt_len = input_ids_shape
78
+ mask = torch.full((tgt_len, tgt_len),
79
+ torch.tensor(torch.finfo(dtype).min, device=device),
80
+ device=device)
81
+ mask_cond = torch.arange(mask.size(-1), device=device)
82
+ mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)
83
+ mask = mask.to(dtype)
84
+
85
+ if past_key_values_length > 0:
86
+ mask = torch.cat([
87
+ torch.zeros(
88
+ tgt_len, past_key_values_length, dtype=dtype, device=device),
89
+ mask
90
+ ],
91
+ dim=-1)
92
+ return mask[None, None, :, :].expand(bsz, 1, tgt_len,
93
+ tgt_len + past_key_values_length)
94
+
95
+
96
+ # Copied from transformers.models.bart.modeling_bart._expand_mask
97
+ def _expand_mask(mask: torch.Tensor,
98
+ dtype: torch.dtype,
99
+ tgt_len: Optional[int] = None):
100
+ """Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len,
101
+ src_seq_len]`."""
102
+ bsz, src_len = mask.size()
103
+ tgt_len = tgt_len if tgt_len is not None else src_len
104
+
105
+ expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len,
106
+ src_len).to(dtype)
107
+
108
+ inverted_mask = 1.0 - expanded_mask
109
+
110
+ return inverted_mask.masked_fill(
111
+ inverted_mask.to(torch.bool),
112
+ torch.finfo(dtype).min)
113
+
114
+
115
+ class InternLM2RMSNorm(nn.Module):
116
+
117
+ def __init__(self, hidden_size, eps=1e-6):
118
+ """InternLM2RMSNorm is equivalent to T5LayerNorm."""
119
+ super().__init__()
120
+ self.weight = nn.Parameter(torch.ones(hidden_size))
121
+ self.variance_epsilon = eps
122
+
123
+ def forward(self, hidden_states):
124
+ input_dtype = hidden_states.dtype
125
+ hidden_states = hidden_states.to(torch.float32)
126
+ variance = hidden_states.pow(2).mean(-1, keepdim=True)
127
+ hidden_states = hidden_states * torch.rsqrt(variance +
128
+ self.variance_epsilon)
129
+ return self.weight * hidden_states.to(input_dtype)
130
+
131
+
132
+ class InternLM2RotaryEmbedding(nn.Module):
133
+
134
+ def __init__(self,
135
+ dim,
136
+ max_position_embeddings=2048,
137
+ base=10000,
138
+ device=None):
139
+ super().__init__()
140
+
141
+ self.dim = dim
142
+ self.max_position_embeddings = max_position_embeddings
143
+ self.base = base
144
+ inv_freq = 1.0 / (
145
+ self.base
146
+ **(torch.arange(0, self.dim, 2).float().to(device) / self.dim))
147
+ self.register_buffer('inv_freq', inv_freq, persistent=False)
148
+
149
+ # Build here to make `torch.jit.trace` work.
150
+ self._set_cos_sin_cache(
151
+ seq_len=max_position_embeddings,
152
+ device=self.inv_freq.device,
153
+ dtype=torch.get_default_dtype())
154
+
155
+ def _set_cos_sin_cache(self, seq_len, device, dtype):
156
+ self.max_seq_len_cached = seq_len
157
+ t = torch.arange(
158
+ self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
159
+
160
+ freqs = torch.einsum('i,j->ij', t, self.inv_freq)
161
+ # Different from paper, but it uses a different permutation in order to obtain the same calculation
162
+ emb = torch.cat((freqs, freqs), dim=-1)
163
+ self.register_buffer(
164
+ 'cos_cached', emb.cos().to(dtype), persistent=False)
165
+ self.register_buffer(
166
+ 'sin_cached', emb.sin().to(dtype), persistent=False)
167
+
168
+ def forward(self, x, seq_len=None):
169
+ # x: [bs, num_attention_heads, seq_len, head_size]
170
+ if seq_len > self.max_seq_len_cached:
171
+ self._set_cos_sin_cache(
172
+ seq_len=seq_len, device=x.device, dtype=x.dtype)
173
+
174
+ return (
175
+ self.cos_cached[:seq_len].to(dtype=x.dtype),
176
+ self.sin_cached[:seq_len].to(dtype=x.dtype),
177
+ )
178
+
179
+
180
+ class InternLM2LinearScalingRotaryEmbedding(InternLM2RotaryEmbedding):
181
+ """InternLM2RotaryEmbedding extended with linear scaling.
182
+
183
+ Credits to the Reddit user /u/kaiokendev
184
+ """
185
+
186
+ def __init__(self,
187
+ dim,
188
+ max_position_embeddings=2048,
189
+ base=10000,
190
+ device=None,
191
+ scaling_factor=1.0):
192
+ self.scaling_factor = scaling_factor
193
+ super().__init__(dim, max_position_embeddings, base, device)
194
+
195
+ def _set_cos_sin_cache(self, seq_len, device, dtype):
196
+ self.max_seq_len_cached = seq_len
197
+ t = torch.arange(
198
+ self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
199
+ t = t / self.scaling_factor
200
+
201
+ freqs = torch.einsum('i,j->ij', t, self.inv_freq)
202
+ # Different from the paper: a different permutation is used, but it yields the same result
203
+ emb = torch.cat((freqs, freqs), dim=-1)
204
+ self.register_buffer(
205
+ 'cos_cached', emb.cos().to(dtype), persistent=False)
206
+ self.register_buffer(
207
+ 'sin_cached', emb.sin().to(dtype), persistent=False)
208
+
209
+
210
+ class InternLM2DynamicNTKScalingRotaryEmbedding(InternLM2RotaryEmbedding):
211
+ """InternLM2RotaryEmbedding extended with Dynamic NTK scaling.
212
+
213
+ Credits to the Reddit users /u/bloc97 and /u/emozilla.
214
+ """
215
+
216
+ def __init__(self,
217
+ dim,
218
+ max_position_embeddings=2048,
219
+ base=10000,
220
+ device=None,
221
+ scaling_factor=1.0):
222
+ self.scaling_factor = scaling_factor
223
+ super().__init__(dim, max_position_embeddings, base, device)
224
+
225
+ def _set_cos_sin_cache(self, seq_len, device, dtype):
226
+ self.max_seq_len_cached = seq_len
227
+
228
+ if seq_len > self.max_position_embeddings:
229
+ base = self.base * ((self.scaling_factor * seq_len /
230
+ self.max_position_embeddings) -
231
+ (self.scaling_factor - 1))**(
232
+ self.dim / (self.dim - 2))
233
+ inv_freq = 1.0 / (
234
+ base
235
+ **(torch.arange(0, self.dim, 2).float().to(device) / self.dim))
236
+ self.register_buffer('inv_freq', inv_freq, persistent=False)
237
+
238
+ t = torch.arange(
239
+ self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
240
+
241
+ freqs = torch.einsum('i,j->ij', t, self.inv_freq)
242
+ # Different from the paper: a different permutation is used, but it yields the same result
243
+ emb = torch.cat((freqs, freqs), dim=-1)
244
+ self.register_buffer(
245
+ 'cos_cached', emb.cos().to(dtype), persistent=False)
246
+ self.register_buffer(
247
+ 'sin_cached', emb.sin().to(dtype), persistent=False)
248
+
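Note (not part of the commit): for sequences longer than `max_position_embeddings`, the dynamic-NTK class above first rescales the RoPE base and only then recomputes `inv_freq`; a hedged restatement of that rescaling:

def ntk_scaled_base(base, dim, seq_len, max_pos, scaling_factor=1.0):
    # base' = base * ((s * L / L_max) - (s - 1)) ** (dim / (dim - 2)), applied only when L > L_max
    if seq_len <= max_pos:
        return base
    return base * ((scaling_factor * seq_len / max_pos) - (scaling_factor - 1)) ** (dim / (dim - 2))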
249
+
250
+ def rotate_half(x):
251
+ """Rotates half the hidden dims of the input."""
252
+ x1 = x[..., :x.shape[-1] // 2]
253
+ x2 = x[..., x.shape[-1] // 2:]
254
+ return torch.cat((-x2, x1), dim=-1)
255
+
256
+
257
+ # def apply_rotary_pos_emb(q, k, cos, sin, position_ids):
258
+ # # The first two dimensions of cos and sin are always 1, so we can `squeeze` them.
259
+ # cos = cos.squeeze(1).squeeze(0) # [seq_len, dim]
260
+ # sin = sin.squeeze(1).squeeze(0) # [seq_len, dim]
261
+ # cos = cos.unsqueeze(0).unsqueeze(0).expand(len(position_ids), -1, -1, -1)
262
+ # sin = sin.unsqueeze(0).unsqueeze(0).expand(len(position_ids), -1, -1, -1)
263
+ # if q.size(2) == 1:
264
+ # q_embed = (q * cos[:, :, -1:, :]) + (
265
+ # rotate_half(q) * sin[:, :, -1:, :])
266
+ # else:
267
+ # q_embed = (q * cos) + (rotate_half(q) * sin)
268
+
269
+ # if k.size(2) == 1:
270
+ # k_embed = (k * cos[:, :, -1:, :]) + (
271
+ # rotate_half(k) * sin[:, :, -1:, :])
272
+ # else:
273
+ # k_embed = (k * cos) + (rotate_half(k) * sin)
274
+
275
+ # return q_embed, k_embed
276
+
277
+ # Copied from transformers.model.llama.modeling_llama.apply_rotary_pos_emb
278
+ def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
279
+ """Applies Rotary Position Embedding to the query and key tensors."""
280
+ cos = cos[position_ids].unsqueeze(unsqueeze_dim)
281
+ sin = sin[position_ids].unsqueeze(unsqueeze_dim)
282
+ q_embed = (q * cos) + (rotate_half(q) * sin)
283
+ k_embed = (k * cos) + (rotate_half(k) * sin)
284
+ return q_embed, k_embed
285
+
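Note (not part of the commit): a shape-only usage sketch of the two helpers above. `rotate_half` negates and swaps the two halves of the last dim, and `apply_rotary_pos_emb` mixes that with the cached cos/sin gathered at `position_ids`.

import torch
q = torch.randn(1, 2, 5, 8)                          # [bsz, n_heads, seq_len, head_dim]
k = torch.randn(1, 2, 5, 8)
cos = torch.randn(16, 8); sin = torch.randn(16, 8)   # caches of length >= seq_len
position_ids = torch.arange(5).unsqueeze(0)          # [1, seq_len]
q_rot, k_rot = apply_rotary_pos_emb(q, k, cos, sin, position_ids)
assert q_rot.shape == q.shape and k_rot.shape == k.shape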
286
+
287
+ class InternLM2MLP(nn.Module):
288
+
289
+ def __init__(self, config):
290
+ super().__init__()
291
+ self.config = config
292
+ self.hidden_size = config.hidden_size
293
+ self.intermediate_size = config.intermediate_size
294
+
295
+ self.w1 = LoRA(
296
+ self.hidden_size,
297
+ self.intermediate_size,
298
+ bias=False,
299
+ lora_r=256,
300
+ lora_alpha=256,
301
+ lora_len=576)
302
+ self.w3 = LoRA(
303
+ self.hidden_size,
304
+ self.intermediate_size,
305
+ bias=False,
306
+ lora_r=256,
307
+ lora_alpha=256,
308
+ lora_len=576)
309
+ self.w2 = LoRA(
310
+ self.intermediate_size,
311
+ self.hidden_size,
312
+ bias=False,
313
+ lora_r=256,
314
+ lora_alpha=256,
315
+ lora_len=576)
316
+
317
+ self.act_fn = ACT2FN[config.hidden_act]
318
+
319
+ def forward(self, x, im_mask):
320
+ down_proj = self.w2(
321
+ self.act_fn(self.w1(x, im_mask)) * self.w3(x, im_mask), im_mask)
322
+
323
+ return down_proj
324
+
325
+
326
+ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
327
+ """This is the equivalent of torch.repeat_interleave(x, dim=1,
328
+ repeats=n_rep).
329
+
330
+ The hidden states go from (batch, num_key_value_heads, seqlen, head_dim) to
331
+ (batch, num_attention_heads, seqlen, head_dim)
332
+ """
333
+ batch, num_key_value_heads, slen, head_dim = hidden_states.shape
334
+ if n_rep == 1:
335
+ return hidden_states
336
+ hidden_states = hidden_states[:, :,
337
+ None, :, :].expand(batch,
338
+ num_key_value_heads,
339
+ n_rep, slen, head_dim)
340
+ return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen,
341
+ head_dim)
342
+
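Note (not part of the commit): `repeat_kv` is what lets the grouped-query attention below share a small set of KV heads across all query heads; a quick check of the docstring's claim:

import torch
kv = torch.randn(1, 8, 10, 64)      # [batch, num_key_value_heads, seq, head_dim]
assert torch.equal(repeat_kv(kv, 4), torch.repeat_interleave(kv, repeats=4, dim=1))   # -> 32 heads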
343
+
344
+ class InternLM2Attention(nn.Module):
345
+ """Multi-headed attention from 'Attention Is All You Need' paper."""
346
+
347
+ def __init__(self, config: InternLM2Config):
348
+ super().__init__()
349
+ self.config = config
350
+ self.hidden_size = config.hidden_size
351
+ self.num_heads = config.num_attention_heads
352
+ self.head_dim = self.hidden_size // self.num_heads
353
+ self.num_key_value_heads = config.num_key_value_heads
354
+ self.num_key_value_groups = self.num_heads // self.num_key_value_heads
355
+ self.max_position_embeddings = config.max_position_embeddings
356
+ self.is_causal = True
357
+
358
+ if (self.head_dim * self.num_heads) != self.hidden_size:
359
+ raise ValueError(
360
+ f'hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}'
361
+ f' and `num_heads`: {self.num_heads}).')
362
+
363
+ self.wqkv = LoRA(
364
+ self.hidden_size,
365
+ (self.num_heads + 2 * self.num_key_value_heads) * self.head_dim,
366
+ bias=config.bias,
367
+ lora_r=256,
368
+ lora_alpha=256,
369
+ lora_len=576)
370
+
371
+ self.wo = LoRA(
372
+ self.num_heads * self.head_dim,
373
+ self.hidden_size,
374
+ bias=config.bias,
375
+ lora_r=256,
376
+ lora_alpha=256,
377
+ lora_len=576)
378
+ self._init_rope()
379
+
380
+ def _init_rope(self):
381
+ if self.config.rope_scaling is None:
382
+ self.rotary_emb = InternLM2RotaryEmbedding(
383
+ self.head_dim,
384
+ max_position_embeddings=self.max_position_embeddings,
385
+ base=self.config.rope_theta,
386
+ )
387
+ else:
388
+ scaling_type = self.config.rope_scaling['type']
389
+ scaling_factor = self.config.rope_scaling['factor']
390
+ if scaling_type == 'dynamic':
391
+ self.rotary_emb = InternLM2DynamicNTKScalingRotaryEmbedding(
392
+ self.head_dim,
393
+ max_position_embeddings=self.max_position_embeddings,
394
+ base=self.config.rope_theta,
395
+ scaling_factor=scaling_factor)
396
+ else:
397
+ raise ValueError(
398
+ "Currently we only support rotary embedding's type being 'dynamic'."
399
+ )
400
+ return self.rotary_emb
401
+
402
+ def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
403
+ return tensor.view(bsz, seq_len, self.num_heads,
404
+ self.head_dim).transpose(1, 2).contiguous()
405
+
406
+ def forward(
407
+ self,
408
+ hidden_states: torch.Tensor,
409
+ attention_mask: Optional[torch.Tensor] = None,
410
+ position_ids: Optional[torch.LongTensor] = None,
411
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
412
+ output_attentions: bool = False,
413
+ use_cache: bool = False,
414
+ im_mask: Optional[Tuple[torch.Tensor]] = None,
415
+ **kwargs,
416
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor],
417
+ Optional[Tuple[torch.Tensor]]]:
418
+ if 'padding_mask' in kwargs:
419
+ warnings.warn(
420
+ 'Passing `padding_mask` is deprecated and will be removed in v4.37. '
421
+ 'Please make sure to use `attention_mask` instead.')
422
+
423
+ bsz, q_len, _ = hidden_states.size()
424
+
425
+ qkv_states = self.wqkv(hidden_states, im_mask)
426
+
427
+ qkv_states = rearrange(
428
+ qkv_states,
429
+ 'b q (h gs d) -> b q h gs d',
430
+ gs=2 + self.num_key_value_groups,
431
+ d=self.head_dim,
432
+ )
433
+
434
+ query_states = qkv_states[..., :self.num_key_value_groups, :]
435
+ query_states = rearrange(query_states, 'b q h gs d -> b q (h gs) d')
436
+ key_states = qkv_states[..., -2, :]
437
+ value_states = qkv_states[..., -1, :]
438
+
439
+ query_states = query_states.transpose(1, 2)
440
+ key_states = key_states.transpose(1, 2)
441
+ value_states = value_states.transpose(1, 2)
442
+
443
+ kv_seq_len = key_states.shape[-2]
444
+ if past_key_value is not None:
445
+ kv_seq_len += past_key_value[0].shape[-2]
446
+ cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
447
+ query_states, key_states = apply_rotary_pos_emb(
448
+ query_states, key_states, cos, sin, position_ids)
449
+
450
+ if past_key_value is not None:
451
+ # reuse k, v, self_attention
452
+ key_states = torch.cat([past_key_value[0], key_states], dim=2)
453
+ value_states = torch.cat([past_key_value[1], value_states], dim=2)
454
+
455
+ past_key_value = (key_states, value_states) if use_cache else None
456
+
457
+ key_states = repeat_kv(key_states, self.num_key_value_groups)
458
+ value_states = repeat_kv(value_states, self.num_key_value_groups)
459
+
460
+ attn_weights = torch.matmul(query_states, key_states.transpose(
461
+ 2, 3)) / math.sqrt(self.head_dim)
462
+
463
+ if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
464
+ raise ValueError(
465
+ f'Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is'
466
+ f' {attn_weights.size()}')
467
+
468
+ if attention_mask is not None:
469
+ if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
470
+ raise ValueError(
471
+ f'Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}'
472
+ )
473
+ attn_weights = attn_weights + attention_mask
474
+
475
+ # upcast attention to fp32
476
+ attn_weights = nn.functional.softmax(
477
+ attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
478
+ attn_output = torch.matmul(attn_weights, value_states)
479
+
480
+ if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
481
+ raise ValueError(
482
+ f'`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is'
483
+ f' {attn_output.size()}')
484
+
485
+ attn_output = attn_output.transpose(1, 2).contiguous()
486
+ attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
487
+
488
+ attn_output = self.wo(attn_output, im_mask)
489
+
490
+ if not output_attentions:
491
+ attn_weights = None
492
+
493
+ return attn_output, attn_weights, past_key_value
494
+
495
+
496
+ class InternLM2FlashAttention2(InternLM2Attention):
497
+ """InternLM2 flash attention module.
498
+
499
+ This module inherits from `InternLM2Attention` as the weights of the module
500
+ stays untouched. The only required change would be on the forward pass
501
+ where it needs to correctly call the public API of flash attention and deal
502
+ with padding tokens in case the input contains any of them.
503
+ """
504
+
505
+ def forward(
506
+ self,
507
+ hidden_states: torch.Tensor,
508
+ attention_mask: Optional[torch.LongTensor] = None,
509
+ position_ids: Optional[torch.LongTensor] = None,
510
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
511
+ output_attentions: bool = False,
512
+ use_cache: bool = False,
513
+ im_mask: Optional[Tuple[torch.Tensor]] = None,
514
+ **kwargs,
515
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor],
516
+ Optional[Tuple[torch.Tensor]]]:
517
+ # InternLM2FlashAttention2 attention does not support output_attentions
518
+ if 'padding_mask' in kwargs:
519
+ warnings.warn(
520
+ 'Passing `padding_mask` is deprecated and will be removed in v4.37. '
521
+ 'Please make sure to use `attention_mask` instead.')
522
+
523
+ # overwrite attention_mask with padding_mask
524
+ attention_mask = kwargs.pop('padding_mask')
525
+
526
+ output_attentions = False
527
+
528
+ bsz, q_len, _ = hidden_states.size()
529
+
530
+ qkv_states = self.wqkv(hidden_states, im_mask)
531
+
532
+ qkv_states = rearrange(
533
+ qkv_states,
534
+ 'b q (h gs d) -> b q h gs d',
535
+ gs=2 + self.num_key_value_groups,
536
+ d=self.head_dim,
537
+ q=q_len,
538
+ )
539
+
540
+ query_states = qkv_states[..., :self.num_key_value_groups, :]
541
+ query_states = rearrange(query_states, 'b q h gs d -> b q (h gs) d')
542
+ key_states = qkv_states[..., -2, :]
543
+ value_states = qkv_states[..., -1, :]
544
+ query_states = query_states.transpose(1, 2)
545
+ key_states = key_states.transpose(1, 2)
546
+ value_states = value_states.transpose(1, 2)
547
+
548
+ kv_seq_len = key_states.shape[-2]
549
+ if past_key_value is not None:
550
+ kv_seq_len += past_key_value[0].shape[-2]
551
+
552
+ cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
553
+
554
+ query_states, key_states = apply_rotary_pos_emb(
555
+ query_states, key_states, cos, sin, position_ids)
556
+
557
+ if past_key_value is not None:
558
+ # reuse k, v, self_attention
559
+ key_states = torch.cat([past_key_value[0], key_states], dim=2)
560
+ value_states = torch.cat([past_key_value[1], value_states], dim=2)
561
+
562
+ past_key_value = (key_states, value_states) if use_cache else None
563
+
564
+ query_states = query_states.transpose(1, 2)
565
+ key_states = key_states.transpose(1, 2)
566
+ value_states = value_states.transpose(1, 2)
567
+
568
+ attn_output = self._flash_attention_forward(
569
+ query_states,
570
+ key_states,
571
+ value_states,
572
+ attention_mask,
573
+ q_len)
574
+
575
+ attn_output = attn_output.reshape(bsz, q_len,
576
+ self.hidden_size).contiguous()
577
+ attn_output = self.wo(attn_output, im_mask)
578
+
579
+ if not output_attentions:
580
+ attn_weights = None
581
+
582
+ return attn_output, attn_weights, past_key_value
583
+
584
+ def _flash_attention_forward(
585
+ self, query_states, key_states, value_states, attention_mask, query_length, dropout=0.0, softmax_scale=None
586
+ ):
587
+ """
588
+ Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
589
+ first unpad the input, then computes the attention scores and pad the final attention scores.
590
+ Args:
591
+ query_states (`torch.Tensor`):
592
+ Input query states to be passed to Flash Attention API
593
+ key_states (`torch.Tensor`):
594
+ Input key states to be passed to Flash Attention API
595
+ value_states (`torch.Tensor`):
596
+ Input value states to be passed to Flash Attention API
597
+ attention_mask (`torch.Tensor`):
598
+ The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
599
+ position of padding tokens and 1 for the position of non-padding tokens.
600
+ dropout (`float`, *optional*):
601
+ Attention dropout
602
+ softmax_scale (`float`, *optional*):
603
+ The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)
604
+ """
605
+ # Contains at least one padding token in the sequence
606
+ causal = self.is_causal and query_length != 1
607
+ if attention_mask is not None:
608
+ batch_size = query_states.shape[0]
609
+ query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._unpad_input(
610
+ query_states, key_states, value_states, attention_mask, query_length
611
+ )
612
+
613
+ cu_seqlens_q, cu_seqlens_k = cu_seq_lens
614
+ max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens
615
+
616
+ attn_output_unpad = flash_attn_varlen_func(
617
+ query_states,
618
+ key_states,
619
+ value_states,
620
+ cu_seqlens_q=cu_seqlens_q,
621
+ cu_seqlens_k=cu_seqlens_k,
622
+ max_seqlen_q=max_seqlen_in_batch_q,
623
+ max_seqlen_k=max_seqlen_in_batch_k,
624
+ dropout_p=dropout,
625
+ softmax_scale=softmax_scale,
626
+ causal=causal,
627
+ )
628
+
629
+ attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)
630
+ else:
631
+ attn_output = flash_attn_func(
632
+ query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal
633
+ )
634
+
635
+ return attn_output
636
+
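Note (not part of the commit): the unpadded path above hands flash_attn_varlen_func cumulative sequence lengths rather than a padding mask; a pure-PyTorch sketch of how such `cu_seqlens` are derived (mirroring what `_get_unpad_data` does in transformers):

import torch
import torch.nn.functional as F
attention_mask = torch.tensor([[1, 1, 1, 0], [1, 1, 0, 0]])                    # two right-padded sequences
seqlens = attention_mask.sum(dim=-1, dtype=torch.int32)                        # tensor([3, 2])
cu_seqlens = F.pad(torch.cumsum(seqlens, dim=0, dtype=torch.int32), (1, 0))    # tensor([0, 3, 5])
max_seqlen = int(seqlens.max())                                                # 3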
637
+ def _unpad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length):
638
+ indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
639
+ batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape
640
+
641
+ key_layer = index_first_axis(
642
+ key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
643
+ )
644
+ value_layer = index_first_axis(
645
+ value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
646
+ )
647
+
648
+ if query_length == kv_seq_len:
649
+ query_layer = index_first_axis(
650
+ query_layer.reshape(batch_size * kv_seq_len, self.num_heads, head_dim), indices_k
651
+ )
652
+ cu_seqlens_q = cu_seqlens_k
653
+ max_seqlen_in_batch_q = max_seqlen_in_batch_k
654
+ indices_q = indices_k
655
+ elif query_length == 1:
656
+ max_seqlen_in_batch_q = 1
657
+ cu_seqlens_q = torch.arange(
658
+ batch_size + 1, dtype=torch.int32, device=query_layer.device
659
+ ) # There is a memcpy here, that is very bad.
660
+ indices_q = cu_seqlens_q[:-1]
661
+ query_layer = query_layer.squeeze(1)
662
+ else:
663
+ # The -q_len: slice assumes left padding.
664
+ attention_mask = attention_mask[:, -query_length:]
665
+ query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask)
666
+
667
+ return (
668
+ query_layer,
669
+ key_layer,
670
+ value_layer,
671
+ indices_q.to(torch.int64),
672
+ (cu_seqlens_q, cu_seqlens_k),
673
+ (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
674
+ )
675
+
676
+ class InternLM2DecoderLayer(nn.Module):
677
+
678
+ def __init__(self, config: InternLM2Config):
679
+ super().__init__()
680
+ self.hidden_size = config.hidden_size
681
+ self.attention = (
682
+ InternLM2Attention(config=config)
683
+ if getattr(config, 'attn_implementation', None) != "flash_attention_2" else
684
+ InternLM2FlashAttention2(config=config))
685
+ self.feed_forward = InternLM2MLP(config)
686
+ self.attention_norm = InternLM2RMSNorm(
687
+ config.hidden_size, eps=config.rms_norm_eps)
688
+ self.ffn_norm = InternLM2RMSNorm(
689
+ config.hidden_size, eps=config.rms_norm_eps)
690
+
691
+ def forward(
692
+ self,
693
+ hidden_states: torch.Tensor,
694
+ attention_mask: Optional[torch.Tensor] = None,
695
+ position_ids: Optional[torch.LongTensor] = None,
696
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
697
+ output_attentions: Optional[bool] = False,
698
+ use_cache: Optional[bool] = False,
699
+ im_mask: Optional[Tuple[torch.Tensor]] = None,
700
+ **kwargs,
701
+ ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor,
702
+ torch.FloatTensor]]]:
703
+ """
704
+ Args:
705
+ hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
706
+ attention_mask (`torch.FloatTensor`, *optional*):
707
+ attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
708
+ query_sequence_length, key_sequence_length)` if default attention is used.
709
+ output_attentions (`bool`, *optional*):
710
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
711
+ returned tensors for more detail.
712
+ use_cache (`bool`, *optional*):
713
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
714
+ (see `past_key_values`).
715
+ past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
716
+ """
717
+ if 'padding_mask' in kwargs:
718
+ warnings.warn(
719
+ 'Passing `padding_mask` is deprecated and will be removed in v4.37. '
720
+ 'Please make sure to use `attention_mask` instead.')
721
+
722
+ residual = hidden_states
723
+
724
+ hidden_states = self.attention_norm(hidden_states)
725
+
726
+ # Self Attention
727
+ hidden_states, self_attn_weights, present_key_value = self.attention(
728
+ hidden_states=hidden_states,
729
+ attention_mask=attention_mask,
730
+ position_ids=position_ids,
731
+ past_key_value=past_key_value,
732
+ output_attentions=output_attentions,
733
+ use_cache=use_cache,
734
+ im_mask=im_mask,
735
+ **kwargs,
736
+ )
737
+ hidden_states = residual + hidden_states
738
+
739
+ # Fully Connected
740
+ residual = hidden_states
741
+ hidden_states = self.ffn_norm(hidden_states)
742
+ hidden_states = self.feed_forward(hidden_states, im_mask)
743
+ hidden_states = residual + hidden_states
744
+
745
+ outputs = (hidden_states, )
746
+
747
+ if output_attentions:
748
+ outputs += (self_attn_weights, )
749
+
750
+ if use_cache:
751
+ outputs += (present_key_value, )
752
+
753
+ return outputs
754
+
755
+
756
+ InternLM2_START_DOCSTRING = r"""
757
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
758
+ library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
759
+ etc.)
760
+
761
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
762
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
763
+ and behavior.
764
+
765
+ Parameters:
766
+ config ([`InternLM2Config`]):
767
+ Model configuration class with all the parameters of the model. Initializing with a config file does not
768
+ load the weights associated with the model, only the configuration. Check out the
769
+ [`~PreTrainedModel.from_pretrained`] method to load the model weights.
770
+ """
771
+
772
+
773
+ @add_start_docstrings(
774
+ 'The bare InternLM2 Model outputting raw hidden-states without any specific head on top.',
775
+ InternLM2_START_DOCSTRING,
776
+ )
777
+ class InternLM2PreTrainedModel(PreTrainedModel):
778
+ config_class = InternLM2Config
779
+ base_model_prefix = 'model'
780
+ supports_gradient_checkpointing = True
781
+ _no_split_modules = ['InternLM2DecoderLayer']
782
+ _skip_keys_device_placement = 'past_key_values'
783
+
784
+ def _init_weights(self, module):
785
+ std = self.config.initializer_range
786
+ if isinstance(module, nn.Linear):
787
+ module.weight.data.normal_(mean=0.0, std=std)
788
+ if module.bias is not None:
789
+ module.bias.data.zero_()
790
+ elif isinstance(module, nn.Embedding):
791
+ module.weight.data.normal_(mean=0.0, std=std)
792
+ if module.padding_idx is not None:
793
+ module.weight.data[module.padding_idx].zero_()
794
+
795
+
796
+ InternLM2_INPUTS_DOCSTRING = r"""
797
+ Args:
798
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
799
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
800
+ it.
801
+
802
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
803
+ [`PreTrainedTokenizer.__call__`] for details.
804
+
805
+ [What are input IDs?](../glossary#input-ids)
806
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
807
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
808
+
809
+ - 1 for tokens that are **not masked**,
810
+ - 0 for tokens that are **masked**.
811
+
812
+ [What are attention masks?](../glossary#attention-mask)
813
+
814
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
815
+ [`PreTrainedTokenizer.__call__`] for details.
816
+
817
+ If `past_key_values` is used, optionally only the last `input_ids` have to be input (see
818
+ `past_key_values`).
819
+
820
+ If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
821
+ and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
822
+ information on the default strategy.
823
+
824
+ - 1 indicates the head is **not masked**,
825
+ - 0 indicates the head is **masked**.
826
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
827
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
828
+ config.n_positions - 1]`.
829
+
830
+ [What are position IDs?](../glossary#position-ids)
831
+ past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or
832
+ when `config.use_cache=True`):
833
+ Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
834
+ `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
835
+ `(batch_size, num_heads, decoder_sequence_length, embed_size_per_head)`.
836
+
837
+ Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
838
+ blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
839
+
840
+ If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
841
+ have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
842
+ of shape `(batch_size, sequence_length)`.
843
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
844
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
845
+ is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
846
+ model's internal embedding lookup matrix.
847
+ use_cache (`bool`, *optional*):
848
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
849
+ `past_key_values`).
850
+ output_attentions (`bool`, *optional*):
851
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
852
+ tensors for more detail.
853
+ output_hidden_states (`bool`, *optional*):
854
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
855
+ more detail.
856
+ return_dict (`bool`, *optional*):
857
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
858
+ """
859
+
860
+
861
+ @add_start_docstrings(
862
+ 'The bare InternLM2 Model outputting raw hidden-states without any specific head on top.',
863
+ InternLM2_START_DOCSTRING,
864
+ )
865
+ class InternLM2Model(InternLM2PreTrainedModel):
866
+ """Transformer decoder consisting of *config.num_hidden_layers* layers.
867
+ Each layer is a [`InternLM2DecoderLayer`]
868
+
869
+ Args:
870
+ config: InternLM2Config
871
+ """
872
+
873
+ _auto_class = 'AutoModel'
874
+
875
+ def __init__(self, config: InternLM2Config):
876
+ super().__init__(config)
877
+ self.padding_idx = config.pad_token_id
878
+ self.vocab_size = config.vocab_size
879
+ self.config = config
880
+
881
+ self.tok_embeddings = nn.Embedding(config.vocab_size,
882
+ config.hidden_size,
883
+ self.padding_idx)
884
+ self.layers = nn.ModuleList([
885
+ InternLM2DecoderLayer(config)
886
+ for _ in range(config.num_hidden_layers)
887
+ ])
888
+ self.norm = InternLM2RMSNorm(
889
+ config.hidden_size, eps=config.rms_norm_eps)
890
+
891
+ self.gradient_checkpointing = False
892
+ # Initialize weights and apply final processing
893
+ self.post_init()
894
+
895
+ def get_input_embeddings(self):
896
+ return self.tok_embeddings
897
+
898
+ def set_input_embeddings(self, value):
899
+ self.tok_embeddings = value
900
+
901
+ # Copied from transformers.models.bart.modeling_bart.BartDecoder._prepare_decoder_attention_mask
902
+ def _prepare_decoder_attention_mask(self, attention_mask, input_shape,
903
+ inputs_embeds, past_key_values_length):
904
+ # create causal mask
905
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
906
+ combined_attention_mask = None
907
+ if input_shape[-1] > 1:
908
+ combined_attention_mask = _make_causal_mask(
909
+ input_shape,
910
+ inputs_embeds.dtype,
911
+ device=inputs_embeds.device,
912
+ past_key_values_length=past_key_values_length,
913
+ )
914
+
915
+ if attention_mask is not None:
916
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
917
+ expanded_attn_mask = _expand_mask(
918
+ attention_mask, inputs_embeds.dtype,
919
+ tgt_len=input_shape[-1]).to(inputs_embeds.device)
920
+ combined_attention_mask = (
921
+ expanded_attn_mask if combined_attention_mask is None else
922
+ expanded_attn_mask + combined_attention_mask)
923
+
924
+ return combined_attention_mask
925
+
926
+ @add_start_docstrings_to_model_forward(InternLM2_INPUTS_DOCSTRING)
927
+ def forward(self,
928
+ input_ids: torch.LongTensor = None,
929
+ attention_mask: Optional[torch.Tensor] = None,
930
+ position_ids: Optional[torch.LongTensor] = None,
931
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
932
+ inputs_embeds: Optional[torch.FloatTensor] = None,
933
+ use_cache: Optional[bool] = None,
934
+ output_attentions: Optional[bool] = None,
935
+ output_hidden_states: Optional[bool] = None,
936
+ return_dict: Optional[bool] = None,
937
+ **kwargs) -> Union[Tuple, BaseModelOutputWithPast]:
938
+
939
+ im_mask = kwargs.get('im_mask', None)
940
+
941
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
942
+ output_hidden_states = (
943
+ output_hidden_states if output_hidden_states is not None else
944
+ self.config.output_hidden_states)
945
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
946
+
947
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
948
+
949
+ if self.config.attn_implementation == "flash_attention_2": _import_flash_attn()
950
+
951
+ # retrieve input_ids and inputs_embeds
952
+ if input_ids is not None and inputs_embeds is not None:
953
+ raise ValueError(
954
+ 'You cannot specify both input_ids and inputs_embeds at the same time'
955
+ )
956
+ elif input_ids is not None:
957
+ batch_size, seq_length = input_ids.shape[:2]
958
+ elif inputs_embeds is not None:
959
+ batch_size, seq_length = inputs_embeds.shape[:2]
960
+ else:
961
+ raise ValueError(
962
+ 'You have to specify either input_ids or inputs_embeds')
963
+
964
+ seq_length_with_past = seq_length
965
+ past_key_values_length = 0
966
+ if past_key_values is not None:
967
+ past_key_values_length = past_key_values[0][0].shape[2]
968
+ seq_length_with_past = seq_length_with_past + past_key_values_length
969
+
970
+ if position_ids is None:
971
+ device = input_ids.device if input_ids is not None else inputs_embeds.device
972
+ position_ids = torch.arange(
973
+ past_key_values_length,
974
+ seq_length + past_key_values_length,
975
+ dtype=torch.long,
976
+ device=device)
977
+ position_ids = position_ids.unsqueeze(0)
978
+
979
+ if inputs_embeds is None:
980
+ inputs_embeds = self.tok_embeddings(input_ids)
981
+ im_mask = torch.zeros(inputs_embeds.shape[:2]).to(
982
+ inputs_embeds.device).bool()
983
+ if self.config.attn_implementation == "flash_attention_2":
984
+ # 2d mask is passed through the layers
985
+ attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
986
+ else:
987
+ if attention_mask is None:
988
+ attention_mask = torch.ones(
989
+ (batch_size, seq_length_with_past), dtype=torch.bool, device=inputs_embeds.device
990
+ )
991
+ attention_mask = self._prepare_decoder_attention_mask(
992
+ attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length
993
+ )
994
+ # embed positions
995
+ # if attention_mask is None:
996
+ # attention_mask = torch.ones((batch_size, seq_length_with_past),
997
+ # dtype=torch.bool,
998
+ # device=inputs_embeds.device)
999
+ # attention_mask = self._prepare_decoder_attention_mask(
1000
+ # attention_mask, (batch_size, seq_length), inputs_embeds,
1001
+ # past_key_values_length)
1002
+
1003
+ # embed positions
1004
+ hidden_states = inputs_embeds
1005
+
1006
+ if self.gradient_checkpointing and self.training:
1007
+ if use_cache:
1008
+ logger.warning_once(
1009
+ '`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...'
1010
+ )
1011
+ use_cache = False
1012
+
1013
+ # decoder layers
1014
+ all_hidden_states = () if output_hidden_states else None
1015
+ all_self_attns = () if output_attentions else None
1016
+ next_decoder_cache = () if use_cache else None
1017
+
1018
+ for idx, decoder_layer in enumerate(self.layers):
1019
+ if output_hidden_states:
1020
+ all_hidden_states += (hidden_states, )
1021
+
1022
+ past_key_value = past_key_values[
1023
+ idx] if past_key_values is not None else None
1024
+
1025
+ if self.gradient_checkpointing and self.training:
1026
+
1027
+ def create_custom_forward(module):
1028
+
1029
+ def custom_forward(*inputs):
1030
+ # None for past_key_value
1031
+ return module(*inputs, output_attentions, None,
1032
+ im_mask)
1033
+
1034
+ return custom_forward
1035
+
1036
+ layer_outputs = torch.utils.checkpoint.checkpoint(
1037
+ create_custom_forward(decoder_layer),
1038
+ hidden_states,
1039
+ attention_mask,
1040
+ position_ids,
1041
+ None,
1042
+ )
1043
+ else:
1044
+ layer_outputs = decoder_layer(
1045
+ hidden_states,
1046
+ attention_mask=attention_mask,
1047
+ position_ids=position_ids,
1048
+ past_key_value=past_key_value,
1049
+ output_attentions=output_attentions,
1050
+ use_cache=use_cache,
1051
+ im_mask=im_mask,
1052
+ )
1053
+
1054
+ hidden_states = layer_outputs[0]
1055
+
1056
+ if use_cache:
1057
+ next_decoder_cache += (
1058
+ layer_outputs[2 if output_attentions else 1], )
1059
+
1060
+ if output_attentions:
1061
+ all_self_attns += (layer_outputs[1], )
1062
+
1063
+ hidden_states = self.norm(hidden_states)
1064
+
1065
+ # add hidden states from the last decoder layer
1066
+ if output_hidden_states:
1067
+ all_hidden_states += (hidden_states, )
1068
+
1069
+ next_cache = next_decoder_cache if use_cache else None
1070
+ if not return_dict:
1071
+ return tuple(
1072
+ v for v in
1073
+ [hidden_states, next_cache, all_hidden_states, all_self_attns]
1074
+ if v is not None)
1075
+ return BaseModelOutputWithPast(
1076
+ last_hidden_state=hidden_states,
1077
+ past_key_values=next_cache,
1078
+ hidden_states=all_hidden_states,
1079
+ attentions=all_self_attns,
1080
+ )
meteor/arch/modeling_meteor.py ADDED
@@ -0,0 +1,286 @@
1
+ # System
2
+ import torch
3
+ from torch import nn
4
+ from utils.utils import *
5
+ import torch.utils.checkpoint
6
+ from transformers.cache_utils import Cache
7
+ from typing import List, Optional, Tuple, Union
8
+ from .build_module import build_vision_projector, build_vision_tower
9
+ from .modeling_internlm2 import InternLM2Model, InternLM2PreTrainedModel
10
+
11
+ # Dataclass & ModelOutput
12
+ from dataclasses import dataclass
13
+ from transformers.modeling_outputs import ModelOutput
14
+ @dataclass
15
+ class MeteorCausalLMOutputWithPast(ModelOutput):
16
+ loss: Optional[torch.FloatTensor] = None
17
+ logits: torch.FloatTensor = None
18
+ past_key_values: Optional[List[torch.FloatTensor]] = None
19
+ tor_features: Optional[List[torch.FloatTensor]] = None
20
+ hidden_states: Optional[Tuple[torch.FloatTensor]] = None
21
+ attentions: Optional[Tuple[torch.FloatTensor]] = None
22
+ image_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
23
+
24
+ class MeteorForCausalLM(InternLM2PreTrainedModel):
25
+ _auto_class = 'AutoModelForCausalLM'
26
+
27
+ _tied_weights_keys = ['output.weight']
28
+
29
+ def __init__(self, config):
30
+ super().__init__(config)
31
+
32
+ # Model
33
+ self.model = InternLM2Model(config)
34
+ self.vocab_size = config.vocab_size
35
+ self.output = nn.Linear(config.hidden_size, config.vocab_size-2, bias=False)
36
+ self.max_length = config.max_length
37
+
38
+ # Initialize weights and apply final processing
39
+ self.post_init()
40
+
41
+ # Vision Encoder
42
+ self.vit = build_vision_tower()
43
+
44
+ # Vision Projection
45
+ self.vision_proj = build_vision_projector()
46
+
47
+ def eval_process(
48
+ self,
49
+ inputs,
50
+ data,
51
+ tokenizer,
52
+ device,
53
+ img_token_number,
54
+ ):
55
+
56
+ batched_qa_prompt=[]
57
+ for _input in inputs:
58
+
59
+ # Visualization
60
+ # imim = _input['image'].cpu().permute(1, 2, 0)
61
+
62
+ # make question, rationale, and answer
63
+ question = make_instruction_for_eval_meteor(_input['question'], data)
64
+
65
+ # add bundle image tokens if it has <image> token
66
+ question = add_bundle_tokens(question, '<image>', img_token_number)
67
+
68
+ batched_qa_prompt.append(question)
69
+
70
+ '''For Final Outputs'''
71
+ qa_prompts = tokenizer(batched_qa_prompt, padding='longest', return_tensors="pt", add_special_tokens=False)
72
+
73
+ # [1] input_ids
74
+ input_ids = qa_prompts.input_ids.to(device)
75
+
76
+ # [2] attention_mask
77
+ attention_mask = qa_prompts.attention_mask.to(device)
78
+
79
+ # [3] im_mask
80
+ im_mask = torch.zeros_like(input_ids).bool()
81
+ im_mask[torch.where(input_ids==self.config.image_token_index)] = True
82
+
83
+ return {"input_ids": input_ids,
84
+ "attention_mask": attention_mask,
85
+ "im_mask": im_mask,
86
+ }
87
+
88
+ def clip_features(self, image):
89
+ self.vit.eval()
90
+ return self.vit(image)
91
+
92
+ def _merge_input_embeds_with_tor_features(self, tor_features, inputs_embeds, input_ids):
93
+
94
+ # batch index for image feature
95
+ batch_ind_tor_feature = 0
96
+
97
+ for ind, input_id in enumerate(input_ids):
98
+ matching = torch.where(input_id==self.config.tor_token_index)
99
+ num_tor_tokens_per_one_sample = len(matching[0])
100
+ inputs_embeds[ind][matching] = tor_features[batch_ind_tor_feature: batch_ind_tor_feature+num_tor_tokens_per_one_sample].to(inputs_embeds.dtype)
101
+ batch_ind_tor_feature += num_tor_tokens_per_one_sample
102
+
103
+ def _merge_input_embeds_with_image_features(self, image_features, inputs_embeds, input_ids):
104
+
105
+ # batch index for image feature
106
+ batch_ind_image_feature = 0
107
+
108
+ # shape of image_features
109
+ _, C, D = image_features.shape
110
+
111
+ for ind, input_id in enumerate(input_ids):
112
+ matching = torch.where(input_id==self.config.image_token_index)
113
+ num_image_tokens_per_one_sample = len(matching[0]) // C
114
+ inputs_embeds[ind][matching] = image_features[batch_ind_image_feature: batch_ind_image_feature+num_image_tokens_per_one_sample].view(-1, D)
115
+ batch_ind_image_feature += num_image_tokens_per_one_sample
116
+
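Note (not part of the commit): a toy illustration of the in-place scatter the two `_merge_*` helpers above perform; embeddings at the positions of the special token are overwritten by the (projected) features, row by row.

import torch
inputs_embeds = torch.zeros(1, 6, 4)               # [bsz, seq_len, hidden]
input_ids = torch.tensor([[5, 9, 9, 9, 7, 2]])     # 9 stands in for the <image>/<tor> token id
features = torch.ones(1, 3, 4)                     # [num_images, tokens_per_image, hidden]
matching = torch.where(input_ids[0] == 9)
inputs_embeds[0][matching] = features.view(-1, 4)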
117
+ def forward(
118
+ self,
119
+ input_ids: torch.LongTensor = None,
120
+ image_features: torch.FloatTensor = None,
121
+ tor_features: torch.FloatTensor = None,
122
+ attention_mask: Optional[torch.Tensor] = None,
123
+ im_mask: torch.BoolTensor = None,
124
+ position_ids: Optional[torch.LongTensor] = None,
125
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
126
+ inputs_embeds: Optional[torch.FloatTensor] = None,
127
+ labels: Optional[torch.LongTensor] = None,
128
+ use_cache: Optional[bool] = None,
129
+ output_attentions: Optional[bool] = None,
130
+ output_hidden_states: Optional[bool] = None,
131
+ return_dict: Optional[bool] = None,
132
+ ) -> Union[Tuple, MeteorCausalLMOutputWithPast]:
133
+
134
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
135
+ output_hidden_states = (
136
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
137
+ )
138
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
139
+
140
+ if inputs_embeds is None:
141
+ # 1. Extract the input embeddings
142
+ inputs_embeds = self.get_input_embeddings()(input_ids)
143
+
144
+ # 2. Merge text and images
145
+ if image_features is not None and input_ids.shape[1] != 1:
146
+ image_features = self.vision_proj(image_features.to(inputs_embeds.dtype))
147
+ self._merge_input_embeds_with_image_features(image_features, inputs_embeds, input_ids)
148
+
149
+ # 3. Merge text and <tor>
150
+ if tor_features is not None and input_ids.shape[1] != 1:
151
+ self._merge_input_embeds_with_tor_features(tor_features, inputs_embeds, input_ids)
152
+
153
+ # In case input_ids.shape[1] == 1 & image_features != None & past_key_values != None, we are in the case of
154
+ # generation with cache
155
+ elif past_key_values is not None and image_features is not None and input_ids.shape[1] == 1:
156
+ # Retrieve the first layer to inspect the logits and mask out the hidden states
157
+ # that are set to 0
158
+ first_layer_past_key_value = past_key_values[0][0][:, :, :, 0]
159
+
160
+ # Sum all dimensions of head_dim (-2) to avoid random errors such as: https://github.com/huggingface/transformers/pull/28032#issuecomment-1863691941
161
+ batch_index, non_attended_tokens = torch.where(first_layer_past_key_value.float().sum(-2) == 0)
162
+
163
+ # Get the target length
164
+ target_length = input_ids.shape[1]
165
+ past_length = first_layer_past_key_value.shape[-1]
166
+
167
+ extended_attention_mask = torch.ones(
168
+ (attention_mask.shape[0], past_length),
169
+ dtype=attention_mask.dtype,
170
+ device=attention_mask.device,
171
+ )
172
+
173
+ # Filter out only the tokens that can be un-attended, this can happen
174
+ # if one uses Llava + Fused modules where the cache on the
175
+ # first iteration is already big enough, or if one passes custom cache
176
+ valid_indices = non_attended_tokens < extended_attention_mask.size(-1)
177
+ new_batch_index = batch_index[valid_indices]
178
+ new_non_attended_tokens = non_attended_tokens[valid_indices]
179
+
180
+ # Zero-out the places where we don't need to attend
181
+ extended_attention_mask[new_batch_index, new_non_attended_tokens] = 0
182
+
183
+ attention_mask = torch.cat((extended_attention_mask, attention_mask[:, -target_length:]), dim=1)
184
+ position_ids = torch.sum(attention_mask, dim=1).unsqueeze(-1) - 1
185
+ im_mask = torch.zeros(inputs_embeds.shape[:2]).bool().to(inputs_embeds.device)
186
+
187
+ outputs = self.model(
188
+ attention_mask=attention_mask,
189
+ position_ids=position_ids,
190
+ past_key_values=past_key_values,
191
+ inputs_embeds=inputs_embeds,
192
+ use_cache=use_cache,
193
+ output_attentions=output_attentions,
194
+ output_hidden_states=output_hidden_states,
195
+ return_dict=return_dict,
196
+ im_mask=im_mask,
197
+ )
198
+
199
+ hidden_states = outputs[0]
200
+ logits = self.output(hidden_states)
201
+
202
+ loss = None
203
+ if labels is not None:
204
+ # Shift so that tokens < n predict n
205
+ if attention_mask is not None:
206
+ shift_attention_mask = attention_mask[..., 1:]
207
+ shift_logits = logits[..., :-1, :][shift_attention_mask.to(logits.device) != 0].contiguous()
208
+ shift_labels = labels[..., 1:][shift_attention_mask.to(labels.device) != 0].contiguous()
209
+ else:
210
+ shift_logits = logits[..., :-1, :].contiguous()
211
+ shift_labels = labels[..., 1:].contiguous()
212
+ # Flatten the tokens
213
+ loss_fct = nn.CrossEntropyLoss()
214
+ loss = loss_fct(
215
+ shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1).to(shift_logits.device)
216
+ )
217
+
218
+ if not return_dict:
219
+ output = (logits,) + outputs[1:]
220
+ return (loss,) + output if loss is not None else output
221
+
222
+ return MeteorCausalLMOutputWithPast(
223
+ loss=loss,
224
+ logits=logits,
225
+ past_key_values=outputs.past_key_values,
226
+ tor_features=hidden_states[torch.where(input_ids==self.config.tor_token_index)],
227
+ hidden_states=outputs.hidden_states,
228
+ attentions=outputs.attentions,
229
+ )
230
+
231
+ def prepare_inputs_for_generation(self,
232
+ input_ids,
233
+ past_key_values=None,
234
+ attention_mask=None,
235
+ inputs_embeds=None,
236
+ image_features=None,
237
+ tor_features=None,
238
+ im_mask=None,
239
+ **kwargs):
240
+ if past_key_values is not None:
241
+ past_length = past_key_values[0][0].shape[2]
242
+
243
+ # Some generation methods already pass only the last input ID
244
+ if input_ids.shape[1] > past_length:
245
+ remove_prefix_length = past_length
246
+ else:
247
+ # Default to old behavior: keep only final ID
248
+ remove_prefix_length = input_ids.shape[1] - 1
249
+
250
+ input_ids = input_ids[:, remove_prefix_length:]
251
+
252
+ position_ids = kwargs.get('position_ids', None)
253
+ if attention_mask is not None and position_ids is None:
254
+ # create position_ids on the fly for batch generation
255
+ position_ids = attention_mask.long().cumsum(-1) - 1
256
+ position_ids.masked_fill_(attention_mask == 0, 1)
257
+ if past_key_values:
258
+ position_ids = position_ids[:, -input_ids.shape[1]:]
259
+
260
+ # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
261
+ if inputs_embeds is not None and past_key_values is None:
262
+ model_inputs = {"inputs_embeds": inputs_embeds}
263
+ else:
264
+ model_inputs = {"input_ids": input_ids}
265
+
266
+ model_inputs.update(
267
+ {
268
+ "position_ids": position_ids,
269
+ "past_key_values": past_key_values,
270
+ "use_cache": kwargs.get("use_cache"),
271
+ "attention_mask": attention_mask,
272
+ "image_features": image_features,
273
+ "tor_features": tor_features,
274
+ "im_mask": im_mask,
275
+ }
276
+ )
277
+ return model_inputs
278
+
279
+ @staticmethod
280
+ def _reorder_cache(past_key_values, beam_idx):
281
+ reordered_past = ()
282
+ for layer_past in past_key_values:
283
+ reordered_past += (tuple(
284
+ past_state.index_select(0, beam_idx.to(past_state.device))
285
+ for past_state in layer_past), )
286
+ return reordered_past
meteor/arch/modeling_mmamba.py ADDED
@@ -0,0 +1,214 @@
1
+ # Transformers
2
+ import re
3
+ import torch
4
+ from torch import nn
5
+ from utils.utils import *
6
+ from typing import Optional, Tuple, Union
7
+ from transformers import MambaForCausalLM
8
+ from transformers import LlavaNextForConditionalGeneration, LlavaForConditionalGeneration
9
+
10
+ class MambaCache:
11
+ def __init__(self, config, batch_size, dtype=torch.float16, device=None):
12
+ self.seqlen_offset = 0
13
+ self.dtype = dtype
14
+ intermediate_size = config.intermediate_size
15
+ ssm_state_size = config.state_size
16
+ conv_kernel_size = config.conv_kernel
17
+
18
+ self.conv_states = {
19
+ i: torch.zeros(batch_size, intermediate_size, conv_kernel_size, device=device, dtype=dtype)
20
+ for i in range(config.num_hidden_layers)
21
+ }
22
+ self.ssm_states = {
23
+ i: torch.zeros(batch_size, intermediate_size, ssm_state_size, device=device, dtype=dtype)
24
+ for i in range(config.num_hidden_layers)
25
+ }
26
+
27
+ # Dataclass & ModelOutput
28
+ from dataclasses import dataclass
29
+ from transformers.modeling_outputs import ModelOutput
30
+ @dataclass
31
+ class MambaCausalLMOutput(ModelOutput):
32
+ loss: Optional[torch.FloatTensor] = None
33
+ cache_params: Optional[MambaCache] = None
34
+ tor_features: Optional[torch.FloatTensor] = None
35
+ hidden_states: Optional[Tuple[torch.FloatTensor]] = None
36
+
37
+ class MeteorMambaForCausalLM(MambaForCausalLM):
38
+ def __init__(self, config):
39
+ super().__init__(config)
40
+
41
+ # initialize other projections for Vision and tor
42
+ self.vision_proj = self.build_vision_projector(1024, self.config.hidden_size)
43
+ self.tor_proj = self.build_vision_projector(self.config.hidden_size, 4096)
44
+
45
+ # replacing embedding size of mamba with that of meteor
46
+ self.backbone.embeddings = nn.Embedding(num_embeddings=92546,
47
+ embedding_dim=self.config.hidden_size)
48
+
49
+ # image processing variable
50
+ self.mean = torch.tensor([0.48145466, 0.4578275, 0.40821073]).view(1,-1,1,1) * 255
51
+ self.std = torch.tensor([0.26862954, 0.26130258, 0.27577711]).view(1,-1,1,1) * 255
52
+
53
+ def image_processor(self, images):
54
+ norm_images = (images - self.mean.to(images.device)) / self.std.to(images.device)
55
+ return norm_images
56
+
57
+ @staticmethod
58
+ def build_vision_projector(mm_hidden_size, hidden_size):
59
+ projector_type = 'mlp2x_gelu'
60
+ mlp_gelu_match = re.match(r'^mlp(\d+)x_gelu$', projector_type)
61
+ if mlp_gelu_match:
62
+ mlp_depth = int(mlp_gelu_match.group(1))
63
+ modules = [nn.Linear(mm_hidden_size, hidden_size)]
64
+ for _ in range(1, mlp_depth):
65
+ modules.append(nn.GELU())
66
+ modules.append(nn.Linear(hidden_size, hidden_size))
67
+ return nn.Sequential(*modules)
68
+
69
+ raise ValueError(f'Unknown projector type: {projector_type}')
70
+
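Note (not part of the commit): with the fixed 'mlp2x_gelu' type, the helper above reduces to a two-layer MLP with a GELU in between; the 1024/2048 sizes below are placeholders, not the actual config values.

import torch
from torch import nn
proj = nn.Sequential(nn.Linear(1024, 2048), nn.GELU(), nn.Linear(2048, 2048))
feats = torch.randn(1, 576, 1024)                  # e.g. CLIP patch features
assert proj(feats).shape == (1, 576, 2048)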
71
+ def eval_process(
72
+ self,
73
+ inputs,
74
+ tokenizer,
75
+ device,
76
+ img_token_number,
77
+ ):
78
+ batched_image=[]
79
+ batched_qa_prompt=[]
80
+ for _input in inputs:
81
+
82
+ # Visualization
83
+ # imim = _input['image'].cpu().permute(1, 2, 0)
84
+
85
+ # add an <image> token to the question if an image is given but the token is missing; the system prompt and <tor> prompt are added below
86
+ if 'image' in _input.keys() and '<image>' not in _input['question']: _input['question'] = '<image>\n' + _input['question']
87
+
88
+ # make question, rationale, and answer
89
+ question = make_instruction_for_mmamba(question=_input['question'])
90
+
91
+ # add bundle image tokens if it has <image> token
92
+ question = add_bundle_tokens(question, '<image>', img_token_number)
93
+
94
+ # making batched meteor prompt
95
+ if 'image' in _input.keys() and _input['image'] is not None: batched_image.append(_input['image'].to(device))
96
+ batched_qa_prompt.append(question)
97
+
98
+ '''For Final Outputs'''
99
+ qa_prompts = tokenizer(batched_qa_prompt, padding='longest', return_tensors="pt", add_special_tokens=False)
100
+
101
+ # [1] input_ids
102
+ input_ids = qa_prompts.input_ids.to(device)
103
+
104
+ # image or only text?
105
+ if len(batched_image):
106
+ # [2] pixel values
107
+ try:
108
+ pixel_values = self.image_processor(torch.stack(batched_image)).to(device)
109
+ assert pixel_values.dim() == 4
110
+ except:
111
+ new_batched_image = []
112
+ for batched_image_element in batched_image:
113
+ if batched_image_element.dim() == 3:
114
+ new_batched_image.append(batched_image_element.unsqueeze(0))
115
+ else:
116
+ new_batched_image.append(batched_image_element)
117
+ pixel_values = self.image_processor(torch.cat(new_batched_image, dim=0)).to(device)
118
+
119
+ return {"input_ids": input_ids, "image": pixel_values}
120
+ else:
121
+ return {"input_ids": input_ids}
122
+
123
+
124
+ def _merge_input_embeds_with_image_features(self, image_features, inputs_embeds, input_ids):
125
+
126
+ # batch index for image feature
127
+ batch_ind_image_feature = 0
128
+
129
+ # shape of image_features
130
+ _, C, D = image_features.shape
131
+
132
+ for ind, input_id in enumerate(input_ids):
133
+ matching = torch.where(input_id==self.config.image_token_index)
134
+ num_image_tokens_per_one_sample = len(matching[0]) // C
135
+ inputs_embeds[ind][matching] = image_features[batch_ind_image_feature: batch_ind_image_feature+num_image_tokens_per_one_sample].view(-1, D)
136
+ batch_ind_image_feature += num_image_tokens_per_one_sample
137
+
138
+ def forward(
139
+ self,
140
+ input_ids: Optional[torch.LongTensor] = None,
141
+ inputs_embeds: Optional[torch.FloatTensor] = None,
142
+ image_features: Optional[torch.FloatTensor] = None,
143
+ cache_params: Optional[MambaCache] = None,
144
+ # labels: Optional[torch.LongTensor] = None,
145
+ output_hidden_states: Optional[bool] = None,
146
+ return_dict: Optional[bool] = None,
147
+ use_cache: Optional[bool] = None,
148
+ **kwargs, # for now we need this for generation
149
+ ) -> Union[Tuple, MambaCausalLMOutput]:
150
+ r"""
151
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
152
+ Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
153
+ `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
154
+ are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
155
+ """
156
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
157
+
158
+
159
+ if inputs_embeds is None:
160
+ # 1. Extract the input embeddings
161
+ inputs_embeds = self.get_input_embeddings()(input_ids)
162
+
163
+ # 2. Merge text and images
164
+ if image_features is not None and input_ids.shape[1] != 1:
165
+ image_features = self.vision_proj(image_features)
166
+ self._merge_input_embeds_with_image_features(image_features, inputs_embeds, input_ids)
167
+
168
+ mamba_outputs = self.backbone(
169
+ cache_params=cache_params,
170
+ inputs_embeds=inputs_embeds,
171
+ output_hidden_states=output_hidden_states,
172
+ return_dict=return_dict,
173
+ use_cache=use_cache,
174
+ )
175
+ hidden_states = mamba_outputs[0]
176
+
177
+ # logits = self.lm_head(hidden_states.to(self.lm_head.weight.dtype)).float()
178
+
179
+ loss = None
180
+ # if labels is not None:
181
+ # # move labels to correct device to enable model parallelism
182
+ # labels = labels.to(logits.device)
183
+ # # Shift so that tokens < n predict n
184
+ # shift_logits = logits[..., :-1, :].contiguous()
185
+ # shift_labels = labels[..., 1:].contiguous()
186
+ # # Flatten the tokens
187
+ # loss_fct = nn.CrossEntropyLoss()
188
+ # loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
189
+
190
+ # if not return_dict:
191
+ # output = (logits,) + mamba_outputs[1:]
192
+ # return ((loss,) + output) if loss is not None else output
193
+
194
+ return MambaCausalLMOutput(
195
+ loss=loss,
196
+ cache_params=mamba_outputs.cache_params,
197
+ tor_features=self.tor_proj(hidden_states[torch.where(input_ids==self.config.tor_token_index)]),
198
+ hidden_states=mamba_outputs.hidden_states,
199
+ )
200
+
201
+ def prepare_inputs_for_generation(
202
+ self, input_ids, cache_params: Optional[MambaCache] = None, inputs_embeds=None, image_features=None, **kwargs
203
+ ):
204
+ # only last token for inputs_ids if the state is passed along.
205
+ if cache_params is not None:
206
+ input_ids = input_ids[:, -1].unsqueeze(-1)
207
+
208
+ if inputs_embeds is not None and cache_params is None:
209
+ model_inputs = {"inputs_embeds": inputs_embeds, "image_features":image_features}
210
+ else:
211
+ model_inputs = {"input_ids": input_ids, "image_features":image_features}
212
+
213
+ model_inputs["cache_params"] = cache_params
214
+ return model_inputs
meteor/arch/tokenization_internlm2.py ADDED
@@ -0,0 +1,252 @@
1
+ # Copyright (c) InternLM. All rights reserved.
2
+ #
3
+ # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
4
+ # and OPT implementations in this library. It has been modified from its
5
+ # original forms to accommodate minor architectural differences compared
6
+ # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
7
+ #
8
+ # Licensed under the Apache License, Version 2.0 (the "License");
9
+ # you may not use this file except in compliance with the License.
10
+ # You may obtain a copy of the License at
11
+ #
12
+ # http://www.apache.org/licenses/LICENSE-2.0
13
+ #
14
+ # Unless required by applicable law or agreed to in writing, software
15
+ # distributed under the License is distributed on an "AS IS" BASIS,
16
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17
+ # See the License for the specific language governing permissions and
18
+ # limitations under the License.
19
+ """Tokenization classes for IntermLM."""
20
+ import os
21
+ from shutil import copyfile
22
+ from typing import Any, Dict, List, Optional, Tuple
23
+
24
+ import sentencepiece as spm
25
+ from transformers.tokenization_utils import PreTrainedTokenizer
26
+ from transformers.utils import logging
27
+
28
+ logger = logging.get_logger(__name__)
29
+
30
+ VOCAB_FILES_NAMES = {'vocab_file': './tokenizer.model'}
31
+
32
+ PRETRAINED_VOCAB_FILES_MAP = {}
33
+
34
+
35
+ class InternLM2Tokenizer(PreTrainedTokenizer):
36
+ """Construct a InternLM tokenizer. Based on byte-level Byte-Pair-Encoding.
37
+
38
+ Args:
39
+ vocab_file (`str`):
40
+ Path to the vocabulary file.
41
+ """
42
+
43
+ vocab_files_names = VOCAB_FILES_NAMES
44
+ pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
45
+ model_input_names = ['input_ids', 'attention_mask']
46
+ _auto_class = 'AutoTokenizer'
47
+
48
+ def __init__(
49
+ self,
50
+ vocab_file,
51
+ unk_token='<unk>',
52
+ bos_token='<s>',
53
+ eos_token='</s>',
54
+ pad_token='</s>',
55
+ sp_model_kwargs: Optional[Dict[str, Any]] = None,
56
+ add_bos_token=True,
57
+ add_eos_token=False,
58
+ decode_with_prefix_space=False,
59
+ clean_up_tokenization_spaces=False,
60
+ **kwargs,
61
+ ):
62
+ self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
63
+ self.vocab_file = vocab_file
64
+ self.add_bos_token = add_bos_token
65
+ self.add_eos_token = add_eos_token
66
+ self.decode_with_prefix_space = decode_with_prefix_space
67
+ self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
68
+ self.sp_model.Load(vocab_file)
69
+ self._no_prefix_space_tokens = None
70
+ super().__init__(
71
+ bos_token=bos_token,
72
+ eos_token=eos_token,
73
+ unk_token=unk_token,
74
+ pad_token=pad_token,
75
+ clean_up_tokenization_spaces=clean_up_tokenization_spaces,
76
+ **kwargs,
77
+ )
78
+ """ Initialization"""
79
+
80
+ @property
81
+ def no_prefix_space_tokens(self):
82
+ if self._no_prefix_space_tokens is None:
83
+ vocab = self.convert_ids_to_tokens(list(range(self.vocab_size)))
84
+ self._no_prefix_space_tokens = {
85
+ i
86
+ for i, tok in enumerate(vocab) if not tok.startswith('▁')
87
+ }
88
+ return self._no_prefix_space_tokens
89
+
90
+ @property
91
+ def vocab_size(self):
92
+ """Returns vocab size."""
93
+ return self.sp_model.get_piece_size()
94
+
95
+ @property
96
+ def bos_token_id(self) -> Optional[int]:
97
+ return self.sp_model.bos_id()
98
+
99
+ @property
100
+ def eos_token_id(self) -> Optional[int]:
101
+ return self.sp_model.eos_id()
102
+
103
+ def get_vocab(self):
104
+ """Returns vocab as a dict."""
105
+ vocab = {
106
+ self.convert_ids_to_tokens(i): i
107
+ for i in range(self.vocab_size)
108
+ }
109
+ vocab.update(self.added_tokens_encoder)
110
+ return vocab
111
+
112
+ def _tokenize(self, text):
113
+ """Returns a tokenized string."""
114
+ return self.sp_model.encode(text, out_type=str)
115
+
116
+ def _convert_token_to_id(self, token):
117
+ """Converts a token (str) in an id using the vocab."""
118
+ return self.sp_model.piece_to_id(token)
119
+
120
+ def _convert_id_to_token(self, index):
121
+ """Converts an index (integer) in a token (str) using the vocab."""
122
+ token = self.sp_model.IdToPiece(index)
123
+ return token
124
+
125
+ def _maybe_add_prefix_space(self, tokens, decoded):
126
+ if tokens and tokens[0] not in self.no_prefix_space_tokens:
127
+ return ' ' + decoded
128
+ else:
129
+ return decoded
130
+
131
+ def convert_tokens_to_string(self, tokens):
132
+ """Converts a sequence of tokens (string) in a single string."""
133
+ current_sub_tokens = []
134
+ out_string = ''
135
+ prev_is_special = False
136
+ for token in tokens:
137
+ # make sure that special tokens are not decoded using sentencepiece model
138
+ if token in self.all_special_tokens:
139
+ if not prev_is_special:
140
+ out_string += ' '
141
+ out_string += self.sp_model.decode(current_sub_tokens) + token
142
+ prev_is_special = True
143
+ current_sub_tokens = []
144
+ else:
145
+ current_sub_tokens.append(token)
146
+ prev_is_special = False
147
+ out_string += self.sp_model.decode(current_sub_tokens)
148
+ out_string = self.clean_up_tokenization(out_string)
149
+ out_string = self._maybe_add_prefix_space(
150
+ tokens=tokens, decoded=out_string)
151
+ return out_string[1:]
152
+
153
+ def save_vocabulary(self,
154
+ save_directory,
155
+ filename_prefix: Optional[str] = None) -> Tuple[str]:
156
+ """Save the vocabulary and special tokens file to a directory.
157
+
158
+ Args:
159
+ save_directory (`str`):
160
+ The directory in which to save the vocabulary.
161
+
162
+ Returns:
163
+ `Tuple(str)`: Paths to the files saved.
164
+ """
165
+ if not os.path.isdir(save_directory):
166
+ logger.error(
167
+ f'Vocabulary path ({save_directory}) should be a directory')
168
+ return
169
+ out_vocab_file = os.path.join(
170
+ save_directory,
171
+ (filename_prefix + '-' if filename_prefix else '') +
172
+ VOCAB_FILES_NAMES['vocab_file'])
173
+
174
+ if os.path.abspath(self.vocab_file) != os.path.abspath(
175
+ out_vocab_file) and os.path.isfile(self.vocab_file):
176
+ copyfile(self.vocab_file, out_vocab_file)
177
+ elif not os.path.isfile(self.vocab_file):
178
+ with open(out_vocab_file, 'wb') as fi:
179
+ content_spiece_model = self.sp_model.serialized_model_proto()
180
+ fi.write(content_spiece_model)
181
+
182
+ return (out_vocab_file, )
183
+
184
+ def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
185
+ if self.add_bos_token:
186
+ bos_token_ids = [self.bos_token_id]
187
+ else:
188
+ bos_token_ids = []
189
+
190
+ output = bos_token_ids + token_ids_0
191
+
192
+ if token_ids_1 is not None:
193
+ output = output + token_ids_1
194
+
195
+ if self.add_eos_token:
196
+ output = output + [self.eos_token_id]
197
+
198
+ return output
199
+
200
+ def get_special_tokens_mask(
201
+ self,
202
+ token_ids_0: List[int],
203
+ token_ids_1: Optional[List[int]] = None,
204
+ already_has_special_tokens: bool = False) -> List[int]:
205
+ """Retrieve sequence ids from a token list that has no special tokens
206
+ added. This method is called when adding special tokens using the
207
+ tokenizer `prepare_for_model` method.
208
+
209
+ Args:
210
+ token_ids_0 (`List[int]`):
211
+ List of IDs.
212
+ token_ids_1 (`List[int]`, *optional*):
213
+ Optional second list of IDs for sequence pairs.
214
+ already_has_special_tokens (`bool`, *optional*, defaults to `False`):
215
+ Whether or not the token list is already formatted with special tokens for the model.
216
+
217
+ Returns:
218
+ `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
219
+ """
220
+ if already_has_special_tokens:
221
+ return super().get_special_tokens_mask(
222
+ token_ids_0=token_ids_0,
223
+ token_ids_1=token_ids_1,
224
+ already_has_special_tokens=True)
225
+
226
+ if token_ids_1 is None:
227
+ return [1] + ([0] * len(token_ids_0)) + [1]
228
+ return [1] + ([0] * len(token_ids_0)) + [1, 1] + (
229
+ [0] * len(token_ids_1)) + [1]
230
+
231
+ def create_token_type_ids_from_sequences(
232
+ self,
233
+ token_ids_0: List[int],
234
+ token_ids_1: Optional[List[int]] = None) -> List[int]:
235
+ """Create a mask from the two sequences passed to be used in a
236
+ sequence-pair classification task. InternLM2 does not make use of token type
237
+ ids, therefore a list of zeros is returned.
238
+
239
+ Args:
240
+ token_ids_0 (`List[int]`):
241
+ List of IDs.
242
+ token_ids_1 (`List[int]`, *optional*):
243
+ Optional second list of IDs for sequence pairs.
244
+
245
+ Returns:
246
+ `List[int]`: List of zeros.
247
+ """
248
+ eos = [self.eos_token_id]
249
+
250
+ if token_ids_1 is None:
251
+ return len(token_ids_0 + eos) * [0]
252
+ return len(token_ids_0 + eos + token_ids_1 + eos) * [0]
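Usage sketch for the tokenizer above (not part of the commit), assuming the hub checkpoint ships the expected SentencePiece tokenizer.model; the demo itself obtains the tokenizer through load_meteor below.

from meteor.arch.tokenization_internlm2 import InternLM2Tokenizer

tok = InternLM2Tokenizer.from_pretrained('BK-Lee/Meteor-MLM')
ids = tok("Describe the image.", return_tensors='pt').input_ids  # BOS is prepended by default
print(tok.decode(ids[0], skip_special_tokens=True))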
meteor/load_meteor.py ADDED
@@ -0,0 +1,47 @@
1
+ import torch
2
+ import warnings
3
+ from config import *
4
+ from transformers import BitsAndBytesConfig
5
+ from .arch.modeling_meteor import MeteorForCausalLM
6
+ from .arch.tokenization_internlm2 import InternLM2Tokenizer
7
+ warnings.filterwarnings(action='ignore')
8
+
9
+
10
+ def load_meteor(link, bits):
11
+
12
+ # huggingface model configuration
13
+ huggingface_config = {}
14
+
15
+ # Bit quantization
16
+ if bits in [4, 8]:
17
+ huggingface_config.update(dict(
18
+ torch_dtype=torch.float16,
19
+ low_cpu_mem_usage=True,
20
+ attn_implementation="flash_attention_2",
21
+ quantization_config=BitsAndBytesConfig(
22
+ load_in_4bit=bits == 4,
23
+ load_in_8bit=bits == 8,
24
+ llm_int8_skip_modules=["vit", "vision_proj", "output", "ffn"],
25
+ llm_int8_threshold=6.0,
26
+ llm_int8_has_fp16_weight=False,
27
+ bnb_4bit_compute_dtype=torch.float16,
28
+ bnb_4bit_use_double_quant=True,
29
+ bnb_4bit_quant_type='nf4'
30
+ )
31
+ ))
32
+ else:
33
+ huggingface_config.update(dict(
34
+ torch_dtype=torch.float16,
35
+ low_cpu_mem_usage=True,
36
+ attn_implementation="flash_attention_2",
37
+ ))
38
+
39
+ # loading backbone model
40
+ meteor = MeteorForCausalLM.from_pretrained(link, **huggingface_config)
41
+
42
+ # loading meteor tokenizer
43
+ # adding <image> and <tor> special token
44
+ tok_meteor = InternLM2Tokenizer.from_pretrained(link, padding_side='left')
45
+ tok_meteor.add_tokens("<image>", special_tokens=True)
46
+ tok_meteor.add_tokens("<tor>", special_tokens=True)
47
+ return meteor, tok_meteor
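Usage sketch (not part of the commit) mirroring how app.py consumes this loader; bits=4 assumes a CUDA GPU with bitsandbytes and flash-attn installed.

from meteor.load_meteor import load_meteor

meteor, tok_meteor = load_meteor('BK-Lee/Meteor-MLM', bits=4)
# the two special tokens registered above resolve to ids appended at the end of the vocab
print(tok_meteor.convert_tokens_to_ids('<image>'), tok_meteor.convert_tokens_to_ids('<tor>'))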
meteor/load_mmamba.py ADDED
@@ -0,0 +1,17 @@
1
+ import torch
2
+ from .arch.modeling_mmamba import MeteorMambaForCausalLM
3
+
4
+ def load_mmamba(link):
5
+
6
+ # huggingface model configuration
7
+ huggingface_config = {}
8
+ huggingface_config.update(dict(
9
+ ignore_mismatched_sizes=True,
10
+ torch_dtype=torch.float32,
11
+ low_cpu_mem_usage=True,
12
+ ))
13
+
14
+ # Meteor Mamba model (kept in fp32, not fp16)
15
+ mmamba = MeteorMambaForCausalLM.from_pretrained(link, **huggingface_config)
16
+
17
+ return mmamba
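Usage sketch (not part of the commit), matching the call in app.py; the Mamba branch is kept in fp32 and simply moved to the GPU.

from meteor.load_mmamba import load_mmamba

mmamba = load_mmamba('BK-Lee/Meteor-Mamba').cuda()
mmamba.eval()  # inference only; app.py additionally freezes the parameters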
requirements.txt ADDED
@@ -0,0 +1,19 @@
1
+ torch==2.2.2
2
+ flash-attn  # pip does not accept --no-build-isolation inside requirements.txt; install separately with: pip install flash-attn --no-build-isolation
3
+ transformers
4
+ bitsandbytes
5
+ accelerate
6
+ peft
7
+ pandas
8
+ pyarrow
9
+ jsonlines
10
+ wandb
11
+ einops
12
+ einops_exts
13
+ sentencepiece
14
+ causal-conv1d>=1.2.0
15
+ mamba-ssm
16
+ timm
17
+ shortuuid
18
+ matplotlib
19
+ gradio
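Because flash-attn, causal-conv1d, mamba-ssm and bitsandbytes all ship compiled CUDA extensions, a quick post-install sanity check can catch a broken environment early; this snippet is illustrative and not part of the commit.

import importlib

for mod in ("flash_attn", "causal_conv1d", "mamba_ssm", "bitsandbytes"):
    try:
        importlib.import_module(mod)
        print(f"{mod}: ok")
    except Exception as err:  # ImportError or CUDA setup failures
        print(f"{mod}: FAILED ({err})")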
utils/ddp_accel_bf16.yaml ADDED
@@ -0,0 +1,16 @@
1
+ compute_environment: LOCAL_MACHINE
2
+ debug: false
3
+ distributed_type: MULTI_GPU
4
+ downcast_bf16: 'no'
5
+ gpu_ids: all
6
+ machine_rank: 0
7
+ main_training_function: main
8
+ mixed_precision: bf16
9
+ num_machines: 1
10
+ num_processes: 1
11
+ rdzv_backend: static
12
+ same_network: true
13
+ tpu_env: []
14
+ tpu_use_cluster: false
15
+ tpu_use_sudo: false
16
+ use_cpu: false
utils/ddp_accel_fp16.yaml ADDED
@@ -0,0 +1,16 @@
1
+ compute_environment: LOCAL_MACHINE
2
+ debug: false
3
+ distributed_type: MULTI_GPU
4
+ downcast_bf16: 'no'
5
+ gpu_ids: all
6
+ machine_rank: 0
7
+ main_training_function: main
8
+ mixed_precision: fp16
9
+ num_machines: 1
10
+ num_processes: 1
11
+ rdzv_backend: static
12
+ same_network: true
13
+ tpu_env: []
14
+ tpu_use_cluster: false
15
+ tpu_use_sudo: false
16
+ use_cpu: false
utils/ds_accel_fp16.yaml ADDED
@@ -0,0 +1,23 @@
1
+ compute_environment: LOCAL_MACHINE
2
+ debug: false
3
+ deepspeed_config:
4
+ gradient_accumulation_steps: 1
5
+ offload_optimizer_device: none
6
+ offload_param_device: none
7
+ zero3_init_flag: false
8
+ zero3_save_16bit_model: false
9
+ zero_stage: 3
10
+ distributed_type: DEEPSPEED
11
+ downcast_bf16: 'no'
12
+ enable_cpu_affinity: false
13
+ machine_rank: 0
14
+ main_training_function: main
15
+ mixed_precision: fp16
16
+ num_machines: 1
17
+ num_processes: 1
18
+ rdzv_backend: static
19
+ same_network: true
20
+ tpu_env: []
21
+ tpu_use_cluster: false
22
+ tpu_use_sudo: false
23
+ use_cpu: false
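The accelerate configs above are consumed via accelerate launch --config_file ...; a minimal sketch is shown below, where train.py is a placeholder entry point not defined in this commit.

import subprocess

# launch a hypothetical training script with the DeepSpeed fp16 config above
subprocess.run(
    ["accelerate", "launch", "--config_file", "utils/ds_accel_fp16.yaml", "train.py"],
    check=True,
)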
utils/utils.py ADDED
@@ -0,0 +1,194 @@
1
+ import os
2
+ import gc
3
+ import math
4
+ import torch
5
+ import base64
6
+ import numpy as np
7
+ from config import *
8
+ import torch.nn.functional as F
9
+
10
+ def memory_optimization():
11
+ # memory deallocation
12
+ gc.collect()
13
+
14
+ # removing cache
15
+ torch.cuda.empty_cache()
16
+
17
+ def freeze_model(model):
18
+ for param in model.parameters():
19
+ param.requires_grad=False
20
+
21
+ def find_special_token(string, special_token):
22
+ start = 0
23
+ while True:
24
+ start = string.find(special_token, start)
25
+ if start == -1: return
26
+ yield start
27
+ start += len(special_token) # use start += 1 to find overlapping matches
28
+
29
+ def insert_tor(sentence, tor_count):
30
+ words = sentence.split()
31
+ gap = len(words) // (tor_count-1)
32
+
33
+ # filtering
34
+ if 0<=gap<=2:
35
+ return False
36
+
37
+ count = 0
38
+ result = ""
39
+ for i, word in enumerate(words):
40
+ if i > 0:  # separate consecutive words with a single space
41
+ result+=' '
42
+ if i % gap == 0 and count != tor_count-1:
43
+ result += '<tor>'
44
+ count += 1
45
+ result += word
46
+ result = result + "<tor>"
47
+ assert len(list(find_special_token(result, '<tor>'))) == tor_count
48
+ return result
49
+
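A quick illustration of insert_tor (not part of the commit): with tor_count=10 the sentence needs enough words for gap to exceed 2, otherwise False is returned.

from utils.utils import insert_tor

sentence = " ".join(f"w{i}" for i in range(40))
tagged = insert_tor(sentence, tor_count=10)  # gap = 40 // 9 = 4, so insertion proceeds
print(tagged.count("<tor>"))                 # 10: nine spread through the text plus one appended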
50
+ def add_bundle_tokens(input_string, special_token, num):
51
+
52
+ # number of special tokens in input_string
53
+ num_special_tokens = len(list(find_special_token(input_string, special_token)))
54
+
55
+ # No special token -> return the raw
56
+ if not num_special_tokens:
57
+ return input_string
58
+
59
+ result = ""
60
+ index = 0
61
+ while index < len(input_string):
62
+ if input_string[index:index + len(special_token)] == special_token:
63
+ result += special_token * num
64
+ index += len(special_token)
65
+ else:
66
+ result += input_string[index]
67
+ index += 1
68
+
69
+ assert len(list(find_special_token(result, special_token))) == num_special_tokens * num
70
+ return result
71
+
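An illustration of add_bundle_tokens (not part of the commit); 1225 matches the int((490/14)**2) image-token count used by the demo.

from utils.utils import add_bundle_tokens

prompt = "<image>\nWhat is shown here?"
expanded = add_bundle_tokens(prompt, "<image>", 1225)
print(expanded.count("<image>"))  # 1225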
72
+ def make_instruction_for_mmamba(question, tor=None):
73
+
74
+ if tor:
75
+ qa_prompt = make_human_string(f"<s>[UNUSED_TOKEN_146]user\n{question}[UNUSED_TOKEN_145]",
76
+ f"[UNUSED_TOKEN_146]rationale\n{tor}[UNUSED_TOKEN_145]\n</s>",
77
+ split='\n')
78
+ else:
79
+ qa_prompt = make_human_string(f"<s>[UNUSED_TOKEN_146]user\n{question}[UNUSED_TOKEN_145]",
80
+ f"[UNUSED_TOKEN_146]rationale\n"+"<tor>"*10+"[UNUSED_TOKEN_145]\n</s>",
81
+ split='\n')
82
+ return qa_prompt
83
+
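Example (not part of the commit): without a pre-computed rationale, the prompt reserves ten <tor> slots for the Mamba branch to fill.

from utils.utils import make_instruction_for_mmamba

prompt = make_instruction_for_mmamba("What is in the image?")
print(prompt.count("<tor>"))  # 10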
84
+ def make_instruction_for_eval_meteor(question, dataset):
85
+ system_prompt = "You should give helpful answer to user based on the rationale."
86
+
87
+ if dataset != "mmmu" and dataset != "mathverse" and dataset != "hallusionbench" and dataset != "demo":
88
+ question = "<image>" + question
89
+
90
+ if dataset in ["sqa", "mmbench", "mmbench_cn", "mmbench_dev", "mmbench_cn_dev", "seed", "qbench", "ai2d", "mmstar"]:
91
+ question = question + "\nAnswer with the option's letter from the given choices directly."
92
+
93
+ elif dataset in ["vqav2", "gqa", "pope", "chartqa"]:
94
+ question = question + "\nAnswer the question using a single word or phrase."
95
+
96
+ elif dataset in ["vizwiz"]:
97
+ question = question + "\nWhen the provided information is insufficient, respond with 'Unanswerable'. Answer the question using a single word or phrase."
98
+
99
+ elif dataset in ["mmmu"]:
100
+ if "A." in question:
101
+ question = question + "\nAnswer with the option's letter from the given choices directly."
102
+ else:
103
+ question = question + "\nAnswer the question using a single word or phrase."
104
+
105
+ elif dataset in ["hallusionbench"]:
106
+ if "Please answer yes or no." not in question:
107
+ question = question + "Please answer yes or no."
108
+
109
+ qa_prompt = make_human_string("<s>"+"<tor>"*10+f"[UNUSED_TOKEN_146]system\n{system_prompt}[UNUSED_TOKEN_145]",
110
+ f"[UNUSED_TOKEN_146]user\n{question}[UNUSED_TOKEN_145]",
111
+ "[UNUSED_TOKEN_146]assistant\n",
112
+ split='\n')
113
+
114
+ return qa_prompt
115
+
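Example (not part of the commit): the 'demo' dataset path used by app.py keeps the question unchanged (no <image> prefix and no dataset-specific answer hint).

from utils.utils import make_instruction_for_eval_meteor

prompt = make_instruction_for_eval_meteor("Describe this image.", "demo")
print(prompt.startswith("<s>" + "<tor>" * 10))  # True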
116
+
117
+ def make_human_string(*args, split):
118
+ out = ''
119
+ for i, arg in enumerate(args):
120
+ out += arg
121
+ if i != len(args)-1:
122
+ out += split
123
+ return out
124
+
125
+ def get_max_new_tokens(data_name):
126
+ if data_name.lower() in ["mme", "pope", "sqa", "mmbench", "mmbench_cn", "mmbench_dev","mmbench_cn_dev", "seed", "qbench", "ai2d", "mmstar", "vqav2", "gqa", "chartqa", "hallusionbench", "textvqa", "mmmu"]:
127
+ return 5
128
+ if data_name.lower() in ["llava", "mm-vet"]:
129
+ return 1024
130
+ else:
131
+ return 128
132
+
133
+ """
134
+ Print Data Statistics
135
+ """
136
+ def print_data_statistics(data):
137
+ # name set
138
+ name_set = {'caption',
139
+ 'instruction',
140
+ 'minigemini',
141
+ 'docdownstream',
142
+ 'docreason',
143
+ 'gllava',
144
+ 'mathvision',
145
+ 'mathinstruct',
146
+ 'mathplus'}
147
+ caption = []
148
+ instruction = []
149
+ minigemini = []
150
+ docdownstream = []
151
+ docreason = []
152
+ gllava = []
153
+ mathvision = []
154
+ mathinstruct = []
155
+ mathplus = []
156
+ for d in data:
157
+ for name in name_set:
158
+ if name in d['id']:
159
+ eval(f'{name}.append(1)')
160
+ break
161
+ num_caption = sum(caption)
162
+ num_instruction = sum(instruction)
163
+ num_minigemini = sum(minigemini)
164
+ num_docdownstream = sum(docdownstream)
165
+ num_docreason = sum(docreason)
166
+ num_gllava = sum(gllava)
167
+ num_mathvision = sum(mathvision)
168
+ num_mathinstruct = sum(mathinstruct)
169
+ num_mathplus = sum(mathplus)
170
+
171
+ total_len = num_caption + num_instruction + num_minigemini + \
172
+ num_docdownstream + num_docreason + num_gllava + \
173
+ num_mathvision + num_mathinstruct + num_mathplus
174
+
175
+ print('Meteor Dataset Structure Statistics')
176
+ print(f'Total Length: {total_len}')
177
+ print('--------------------------------------------')
178
+ print(f'ShareGPT4V-Caption: {num_caption}')
179
+ print(f'ShareGPT4V-Instruction: {num_instruction}')
180
+ print(f'MiniGemini: {num_minigemini}')
181
+ print(f'DocDownstream: {num_docdownstream}')
182
+ print(f'DocReason: {num_docreason}')
183
+ print(f'GLLaVA: {num_gllava}')
184
+ print(f'MathVision: {num_mathvision}')
185
+ print(f'MathInstruct: {num_mathinstruct}')
186
+ print(f'MathPlus: {num_mathplus}')
187
+ print('--------------------------------------------')
188
+ print(f'Real-World Image: {num_caption + num_instruction}')
189
+ print(f'Document & Chart & Diagram & Sign & Symbol: {num_minigemini + num_docdownstream + num_docreason}')
190
+ print(f'Math: {num_gllava + num_mathvision + num_mathinstruct + num_mathplus}')
191
+ print(f' Math with Vision: {num_gllava + num_mathvision}')
192
+ print(f' Math with Text only: {num_mathinstruct + num_mathplus}')
193
+ print('--------------------------------------------')
194
+ print('')