"""Score extracted model answers against ground-truth answers.

The script reads a results JSON (one entry per problem id), normalizes each
entry's ``extraction`` into a ``prediction``, compares it with the answer,
writes the updated results back, and saves overall plus per-category accuracy
to a scores JSON.  Optionally it adds the accuracy gain over a random-guess
baseline score file.
"""
import argparse
import json
import os
import re
import sys

import pandas as pd
# !pip install python-Levenshtein
from Levenshtein import distance

# The path must be extended before importing the project-local module.
sys.path.append('../')
from utilities import *  # read_json / save_json used below are expected to come from here


def get_most_similar(prediction, choices):
    """
    Use the Levenshtein distance (or edit distance) to determine which of
    the choices is most similar to the given prediction.

    Returns the first choice with the smallest distance.  Raises ValueError
    when ``choices`` is empty.
    """
    # min() keeps the first minimal element, same as index-of-min on a list.
    return min(choices, key=lambda choice: distance(prediction, choice))


def normalize_extracted_answer(extraction, choices, question_type, answer_type, precision):
    """
    Normalize the extracted answer to match the answer type.

    - ``question_type == 'multi_choice'``: returns one of ``choices`` (either
      the one addressed by an option letter such as "(A)" / "A", or the one
      with the smallest edit distance to the extraction).
    - ``answer_type == 'integer'``: returns ``str(int(float(extraction)))``.
    - ``answer_type == 'float'``: returns the value rounded to ``precision``
      digits, as a string.
    - ``answer_type == 'list'``: returns ``str(extraction)``.

    For the non-multi-choice types, None is returned when the conversion
    fails.  Any other combination returns ``extraction`` unchanged.
    """
    if question_type == 'multi_choice':
        # make sure the extraction is a string
        if isinstance(extraction, str):
            extraction = extraction.strip()
        else:
            try:
                extraction = str(extraction)
            except Exception:
                extraction = ""

        # extract "A" from "(A) text"
        letter = re.findall(r'\(([a-zA-Z])\)', extraction)
        if letter:
            extraction = letter[0].upper()

        options = [chr(ord('A') + i) for i in range(len(choices))]

        if extraction in options:
            # convert option letter to text, e.g. "A" -> "text"
            extraction = choices[options.index(extraction)]
        else:
            # select the most similar option
            extraction = get_most_similar(extraction, choices)
        assert extraction in choices

    elif answer_type == 'integer':
        try:
            extraction = str(int(float(extraction)))
        except (TypeError, ValueError, OverflowError):
            # non-numeric text, None, NaN or infinity
            extraction = None

    elif answer_type == 'float':
        try:
            extraction = str(round(float(extraction), precision))
        except (TypeError, ValueError, OverflowError):
            # non-numeric text, None, or an unusable precision
            extraction = None

    elif answer_type == 'list':
        try:
            extraction = str(extraction)
        except Exception:
            extraction = None

    return extraction


def safe_equal(prediction, answer):
    """
    Check if the prediction is equal to the answer, even if they are of
    different types.  Any exception raised by the comparison is printed and
    treated as "not equal".
    """
    try:
        return bool(prediction == answer)
    except Exception as e:
        print(e)
        return False


def get_acc_with_contion(res_pd, key, value):
    """
    Compute the accuracy over the rows of ``res_pd`` selected by ``key``.

    For ``key == 'skills'`` a row is selected when ``value`` is contained in
    the row's entry; otherwise when the entry equals ``value``.

    Returns ``(correct, total, acc)`` where ``acc`` is the percentage
    formatted with two decimals.  An empty selection yields
    ``(0, 0, "0.00")`` instead of dividing by zero.
    """
    if key == 'skills':
        total_pd = res_pd[res_pd[key].apply(lambda x: value in x)]
    else:
        total_pd = res_pd[res_pd[key] == value]

    # Element-wise comparison on a DataFrame column, so `== True` is intended.
    correct_pd = total_pd[total_pd['true_false'] == True]  # noqa: E712

    correct, total = len(correct_pd), len(total_pd)
    if total == 0:
        return 0, 0, "0.00"
    acc = "{:.2f}".format(correct / total * 100)
    return correct, total, acc


def _parse_args():
    """Build the command-line parser and parse the arguments."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--output_dir', type=str, default='../results')
    parser.add_argument('--output_file', type=str, default='output.json')
    parser.add_argument('--score_file', type=str, default='scores.json')
    parser.add_argument('--gt_file', type=str, default='../data/testmini.json', help='ground truth file')
    parser.add_argument('--number', type=int, default=-1, help='number of problems to run')
    parser.add_argument('--rerun', action='store_true', help='rerun the evaluation')
    parser.add_argument('--caculate_gain', action='store_true', help='caculate the socre gains over random guess')
    parser.add_argument('--random_file', type=str, default='score_random_guess.json')
    return parser.parse_args()


def _evaluate_predictions(results, gts, pids, rerun):
    """
    Set 'prediction' and 'true_false' (and 'answer' when missing) on every
    problem in ``pids``.  Returns True when any stored prediction or verdict
    was added or changed.
    """
    updated = False
    for pid in pids:
        problem = results[pid]

        if rerun:
            # forget earlier verdicts so they count as updated
            problem.pop('prediction', None)
            problem.pop('true_false', None)

        choices = problem['choices']
        question_type = problem['question_type']
        answer_type = problem['answer_type']
        precision = problem['precision']
        extraction = problem['extraction']

        if 'answer' in problem:
            answer = problem['answer']
        else:
            # fall back to the ground-truth file
            answer = gts[pid]['answer']
            problem['answer'] = answer

        # normalize the extracted answer to match the answer type
        prediction = normalize_extracted_answer(extraction, choices, question_type, answer_type, precision)

        # verify the prediction is true or false
        true_false = safe_equal(prediction, answer)

        # detect whether the stored values differ from the fresh ones
        if 'true_false' not in problem or true_false != problem['true_false']:
            updated = True
        if 'prediction' not in problem or prediction != problem['prediction']:
            updated = True

        problem['prediction'] = prediction
        problem['true_false'] = true_false

    return updated


def _build_dataframe(results, pids):
    """
    Return a DataFrame with one row per evaluated problem, with the nested
    'metadata' attribute merged into the top level.  Only ``pids`` are
    included so that unevaluated problems do not count towards the totals;
    ``results`` itself is left untouched.
    """
    records = {}
    for pid in pids:
        record = dict(results[pid])
        record.update(record.pop('metadata', {}))
        records[pid] = record
    return pd.DataFrame(records).T


def _add_fine_grained_scores(df, scores):
    """Add per-value accuracy for every target key to ``scores`` (in place)."""
    # assign the target keys for evaluation
    target_keys = ['question_type', 'answer_type', 'language', 'source', 'category', 'task', 'context', 'grade', 'skills']

    for key in target_keys:
        if key not in df.columns:
            # e.g. nothing was evaluated, so the table has no columns
            print(f"\nType: [{key}] is missing from the results, skipped.")
            continue

        print(f"\nType: [{key}]")

        # get the unique values of the key
        if key == 'skills':
            # each entry is a list; iterate the values directly instead of
            # indexing the pid-labelled column with integer positions
            values = list({skill for skills in df[key] for skill in skills})
        else:
            values = df[key].unique()

        # calculate the accuracy for each value
        scores[key] = {}
        for value in values:
            correct, total, acc = get_acc_with_contion(df, key, value)
            if total > 0:
                print(f"[{value}]: {acc}% ({correct}/{total})")
                scores[key][value] = {"accuracy": acc, "correct": correct, "total": total}

        # sort the scores by accuracy
        scores[key] = dict(sorted(scores[key].items(), key=lambda item: float(item[1]['accuracy']), reverse=True))


def _add_score_gains(scores, random_scores):
    """Add 'acc_gain' (accuracy minus the random-guess accuracy) to ``scores``."""
    for key in scores:
        if key == 'average':
            gain = round(float(scores[key]['accuracy']) - float(random_scores[key]['accuracy']), 2)
            # NOTE: stored as a number here but as a string below; kept as is
            # for compatibility with existing score files.
            scores[key]['acc_gain'] = gain
        else:
            for sub_key in scores[key]:
                gain = round(float(scores[key][sub_key]['accuracy']) - float(random_scores[key][sub_key]['accuracy']), 2)
                scores[key][sub_key]['acc_gain'] = str(gain)


def main():
    """Run the evaluation described in the module docstring."""
    args = _parse_args()

    output_file = os.path.join(args.output_dir, args.output_file)

    # read json
    print(f"Reading {output_file}...")
    results = read_json(output_file)

    # read ground truth
    print(f"Reading {args.gt_file}...")
    gts = read_json(args.gt_file)

    # full pids
    full_pids = list(results.keys())
    if args.number > 0:
        full_pids = full_pids[:args.number]
    print("Number of testing problems:", len(full_pids))

    ## [1] Evaluate if the prediction is true or false
    print("\nEvaluating the predictions...")
    update_json_flag = _evaluate_predictions(results, gts, full_pids, args.rerun)

    # save the updated json
    if update_json_flag:
        print("\n!!!Some problems are updated.!!!")
        print(f"\nSaving {output_file}...")
        save_json(results, output_file)

    ## [2] Calculate the average accuracy
    total = len(full_pids)
    correct = sum(1 for pid in full_pids if results[pid]['true_false'])
    # guard against an empty problem set
    accuracy = str(round(correct / total * 100, 2)) if total else "0.0"
    print(f"\nCorrect: {correct}, Total: {total}, Accuracy: {accuracy}%")

    scores = {"average": {"accuracy": accuracy, "correct": correct, "total": total}}

    ## [3] Calculate the fine-grained accuracy scores
    df = _build_dataframe(results, full_pids)
    print(len(df))
    print("Number of test problems:", len(df))
    # assert len(df) == 1000 # Important!!!

    _add_fine_grained_scores(df, scores)

    # save the scores
    scores_file = os.path.join(args.output_dir, args.score_file)
    print(f"\nSaving {scores_file}...")
    save_json(scores, scores_file)
    print("\nDone!")

    # [4] Calculate the score gains over random guess
    if args.caculate_gain:
        random_file = os.path.join(args.output_dir, args.random_file)
        with open(random_file, encoding="utf-8") as f:
            random_scores = json.load(f)

        print("\nCalculating the score gains...")
        _add_score_gains(scores, random_scores)

        # save the score gains
        print(f"\nSaving {scores_file}...")
        save_json(scores, scores_file)
        print("\nDone!")


if __name__ == '__main__':
    main()