import os
import re
import sys
import json
import argparse

import pandas as pd
# !pip install python-Levenshtein
from Levenshtein import distance

sys.path.append('../')
from utilities import *  # provides read_json and save_json


def get_most_similar(prediction, choices):
    """
    Use the Levenshtein distance (edit distance) to determine which of the
    choices is most similar to the given prediction.
    """
    distances = [distance(prediction, choice) for choice in choices]
    ind = distances.index(min(distances))
    return choices[ind]
    # return min(choices, key=lambda choice: distance(prediction, choice))
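
# Example (illustrative values, not from the dataset):
#   get_most_similar("the Pacific Ocean", ["Atlantic Ocean", "Pacific Ocean", "Indian Ocean"])
#   returns "Pacific Ocean", the choice with the smallest edit distance to the prediction.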


def normalize_extracted_answer(extraction, choices, question_type, answer_type, precision):
    """
    Normalize the extracted answer to match the answer type.
    """
    if question_type == 'multi_choice':
        # make sure the extraction is a string
        if isinstance(extraction, str):
            extraction = extraction.strip()
        else:
            try:
                extraction = str(extraction)
            except:
                extraction = ""

        # extract "A" from "(A) text"
        letter = re.findall(r'\(([a-zA-Z])\)', extraction)
        if len(letter) > 0:
            extraction = letter[0].upper()

        options = [chr(ord('A') + i) for i in range(len(choices))]
        if extraction in options:
            # convert the option letter to its choice text, e.g. "A" -> choices[0]
            ind = options.index(extraction)
            extraction = choices[ind]
        else:
            # select the most similar option
            extraction = get_most_similar(extraction, choices)
        assert extraction in choices

    elif answer_type == 'integer':
        try:
            extraction = str(int(float(extraction)))
        except:
            extraction = None

    elif answer_type == 'float':
        try:
            extraction = str(round(float(extraction), precision))
        except:
            extraction = None

    elif answer_type == 'list':
        try:
            extraction = str(extraction)
        except:
            extraction = None

    return extraction
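
# Examples (illustrative values, assuming MathVista-style question/answer types):
#   normalize_extracted_answer("(B) 12", ["10", "12", "14"], "multi_choice", "text", None)
#       -> "12"    # the letter "B" is mapped to the second choice
#   normalize_extracted_answer("3.14159", None, "free_form", "float", 2)
#       -> "3.14"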


def safe_equal(prediction, answer):
    """
    Check if the prediction is equal to the answer, even if they are of different types.
    """
    try:
        if prediction == answer:
            return True
        return False
    except Exception as e:
        print(e)
        return False


def get_acc_with_contion(res_pd, key, value):
    """
    Compute the accuracy over the rows of res_pd whose `key` column matches `value`.
    """
    if key == 'skills':
        # 'skills' is a list-valued column, so test membership instead of equality
        total_pd = res_pd[res_pd[key].apply(lambda x: value in x)]
    else:
        total_pd = res_pd[res_pd[key] == value]
    correct_pd = total_pd[total_pd['true_false'] == True]
    acc = "{:.2f}".format(len(correct_pd) / len(total_pd) * 100)
    return len(correct_pd), len(total_pd), acc
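
# Example (illustrative): for a DataFrame with two rows,
#   {'answer_type': 'integer', 'true_false': True} and
#   {'answer_type': 'integer', 'true_false': False},
# get_acc_with_contion(df, 'answer_type', 'integer') returns (1, 2, "50.00").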


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--output_dir', type=str, default='../results')
    parser.add_argument('--output_file', type=str, default='output.json')
    parser.add_argument('--score_file', type=str, default='scores.json')
    parser.add_argument('--gt_file', type=str, default='../data/testmini.json', help='ground truth file')
    parser.add_argument('--number', type=int, default=-1, help='number of problems to run')
    parser.add_argument('--rerun', action='store_true', help='rerun the evaluation')
    parser.add_argument('--caculate_gain', action='store_true', help='calculate the score gains over random guess')
    parser.add_argument('--random_file', type=str, default='score_random_guess.json')
    args = parser.parse_args()
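
    # Example invocation (script name and paths are illustrative):
    #   python calculate_score.py --output_dir ../results/gpt4v \
    #       --output_file output.json --score_file scores.json --rerun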

    # args
    output_file = os.path.join(args.output_dir, args.output_file)

    # # quick test
    # output_file = '../results/llava-llama-2-13b/output_llava_llama_2_13b.json'

    # read json
    print(f"Reading {output_file}...")
    results = read_json(output_file)

    # read ground truth
    print(f"Reading {args.gt_file}...")
    gts = read_json(args.gt_file)

    # full pids
    full_pids = list(results.keys())
    if args.number > 0:
        full_pids = full_pids[:min(args.number, len(full_pids))]
    print("Number of testing problems:", len(full_pids))

    ## [1] Evaluate if the prediction is true or false
    print("\nEvaluating the predictions...")

    update_json_flag = False
    for pid in full_pids:
        problem = results[pid]
        # print(problem)

        if args.rerun:
            if 'prediction' in problem:
                del problem['prediction']
            if 'true_false' in problem:
                del problem['true_false']

        choices = problem['choices']
        question_type = problem['question_type']
        answer_type = problem['answer_type']
        precision = problem['precision']
        extraction = problem['extraction']

        if 'answer' in problem:
            answer = problem['answer']
        else:
            answer = gts[pid]['answer']
            problem['answer'] = answer

        # normalize the extracted answer to match the answer type
        prediction = normalize_extracted_answer(extraction, choices, question_type, answer_type, precision)

        # verify the prediction is true or false
        true_false = safe_equal(prediction, answer)

        # update the problem and note whether the json needs to be re-saved
        if "true_false" not in problem:
            update_json_flag = True
        elif true_false != problem['true_false']:
            update_json_flag = True

        if "prediction" not in problem:
            update_json_flag = True
        elif prediction != problem['prediction']:
            update_json_flag = True

        problem['prediction'] = prediction
        problem['true_false'] = true_false

    # save the updated json
    if update_json_flag:
        print("\n!!!Some problems are updated.!!!")
        print(f"\nSaving {output_file}...")
        save_json(results, output_file)

    ## [2] Calculate the average accuracy
    total = len(full_pids)
    correct = 0
    for pid in full_pids:
        if results[pid]['true_false']:
            correct += 1

    accuracy = str(round(correct / total * 100, 2))
    print(f"\nCorrect: {correct}, Total: {total}, Accuracy: {accuracy}%")

    scores = {"average": {"accuracy": accuracy, "correct": correct, "total": total}}
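
    # For reference, `scores` now holds only the overall result and is extended
    # below with per-key breakdowns; e.g. (values are illustrative):
    #   {"average": {"accuracy": "61.3", "correct": 613, "total": 1000}}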

    ## [3] Calculate the fine-grained accuracy scores

    # merge the 'metadata' attribute into the data
    for pid in results:
        results[pid].update(results[pid].pop('metadata'))

    # convert the data to a pandas DataFrame
    df = pd.DataFrame(results).T
    print("Number of test problems:", len(df))
    # assert len(df) == 1000 # Important!!!

    # assign the target keys for evaluation
    target_keys = ['question_type', 'answer_type', 'language', 'source', 'category', 'task', 'context', 'grade', 'skills']

    for key in target_keys:
        print(f"\nType: [{key}]")

        # get the unique values of the key
        if key == 'skills':
            # the value is a list
            values = []
            for i in range(len(df)):
                values += df[key][i]
            values = list(set(values))
        else:
            values = df[key].unique()
        # print(values)

        # calculate the accuracy for each value
        scores[key] = {}
        for value in values:
            correct, total, acc = get_acc_with_contion(df, key, value)
            if total > 0:
                print(f"[{value}]: {acc}% ({correct}/{total})")
                scores[key][value] = {"accuracy": acc, "correct": correct, "total": total}

        # sort the scores by accuracy
        scores[key] = dict(sorted(scores[key].items(), key=lambda item: float(item[1]['accuracy']), reverse=True))

    # save the scores
    scores_file = os.path.join(args.output_dir, args.score_file)
    print(f"\nSaving {scores_file}...")
    save_json(scores, scores_file)
    print("\nDone!")

    ## [4] Calculate the score gains over random guess
    if args.caculate_gain:
        random_file = os.path.join(args.output_dir, args.random_file)
        random_scores = json.load(open(random_file))

        print("\nCalculating the score gains...")
        for key in scores:
            if key == 'average':
                gain = round(float(scores[key]['accuracy']) - float(random_scores[key]['accuracy']), 2)
                scores[key]['acc_gain'] = gain
            else:
                for sub_key in scores[key]:
                    gain = round(float(scores[key][sub_key]['accuracy']) - float(random_scores[key][sub_key]['accuracy']), 2)
                    scores[key][sub_key]['acc_gain'] = str(gain)

        # save the score gains
        print(f"\nSaving {scores_file}...")
        save_json(scores, scores_file)
        print("\nDone!")
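
    # Illustrative gain computation (values are made up): if the model's average
    # accuracy is 61.3 and the random-guess baseline is 17.9, then
    # acc_gain = round(61.3 - 17.9, 2) = 43.4.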