djstrong's picture
Add calc_avg.py for average score calculation and refactor task retrieval in about.py
b9262b0
raw
history blame
3.8 kB
import glob
import json
import argparse
import sys
from dataclasses import dataclass
from enum import Enum
import csv
@dataclass(frozen=True)
class Task:
    """Immutable record describing a single evaluation task.

    Instances are frozen, so they are hashable and compare by field value.
    NOTE(review): this looks like a local copy of the task record used by
    ``src.about.Tasks`` -- confirm whether it is still needed here.
    """

    # Identifier of the benchmark this task belongs to.
    benchmark: str
    # Name of the metric reported for the benchmark.
    metric: str
    # Column label for the task (presumably used for display -- confirm).
    col_name: str
    # Task category label; the name shadows the builtin but is part of the
    # public field interface, so it is kept as-is.
    type: str
    # Baseline score for the task; defaults to 0.0 when not given.
    baseline: float = 0.0
from src.about import Tasks, get_tasks
# Task groups supplied by src.about.get_tasks(). Their members are used below
# as keys into the per-benchmark results/baselines dicts (presumably benchmark
# names for the generative, multiple-choice, RAG and combined groups --
# confirm against src.about).
# NOTE: this call runs at import time, not only when executed as a script.
g_tasks, mc_tasks, rag_tasks, all_tasks = get_tasks()
# Leading CSV columns, in the exact order the summary values are written.
# 'average_rag' was missing from the original header, which shifted every
# per-task column by one relative to its heading.
SUMMARY_COLUMNS = ['file', 'name', 'average', 'average_old', 'average_g', 'average_mc', 'average_rag']

# Metrics whose name contains one of these markers are stored as reported;
# every other metric value is multiplied by 100.
UNSCALED_METRIC_MARKERS = ('perplexity', 'eqbench')


def collect_result_paths(target):
    """Return the result files designated by *target*.

    A target ending in ``.json`` is taken as a single file; anything else is
    treated as a directory that is searched recursively for ``results*.json``.
    """
    if target.endswith('.json'):
        return [target]
    return glob.glob(target + '/**/results*.json', recursive=True)


def load_results(paths, tasks):
    """Merge per-benchmark scores from every file in *paths*.

    Args:
        paths: JSON files, each holding a top-level ``results`` mapping of
            benchmark -> metric -> value.
        tasks: iterable of enum-like members exposing ``value.benchmark`` and
            ``value.metric``.

    Returns:
        Dict mapping benchmark name to its score. When several files report
        the same benchmark, the last file wins; a file lacking a benchmark
        leaves any earlier value in place.
    """
    results = {}
    for path in paths:
        print(path, file=sys.stderr)
        # Context manager closes the handle; JSON text is UTF-8 by specification.
        with open(path, encoding='utf-8') as handle:
            data = json.load(handle)

        for task in tasks:
            benchmark = task.value.benchmark
            metric = task.value.metric
            try:
                score = data['results'][benchmark][metric]
            except KeyError:
                print(f'No data for {benchmark}', file=sys.stderr)
                continue
            if not any(marker in metric for marker in UNSCALED_METRIC_MARKERS):
                score *= 100
            results[benchmark] = score
    return results


def normalized_average(task_names, results, baselines):
    """Return the mean baseline-normalised score over *task_names*.

    Each score is mapped to ``(score - baseline) / (100 - baseline) * 100``.
    Tasks absent from *results* count as a score of 0 and tasks absent from
    *baselines* as a baseline of 0.

    Returns:
        The mean as a float, or ``None`` when *task_names* is empty (written
        as an empty CSV cell instead of failing with ZeroDivisionError).

    Raises:
        ZeroDivisionError: if a task's baseline equals 100, for which the
            normalisation is undefined.
    """
    if not task_names:
        return None
    normalized = (
        (results.get(name, 0) - baselines.get(name, 0)) / (100 - baselines.get(name, 0)) * 100
        for name in task_names
    )
    return sum(normalized) / len(task_names)


def plain_average(task_names, results):
    """Return the un-normalised mean score over *task_names*.

    Only scores present in *results* and not ``None`` are summed, but the
    divisor is always ``len(task_names)``, so missing tasks pull the mean
    down. Returns ``None`` when *task_names* is empty.
    """
    if not task_names:
        return None
    wanted = set(task_names)
    total = sum(score for name, score in results.items() if score is not None and name in wanted)
    return total / len(task_names)


def build_header(tasks):
    """Return the CSV header: summary columns followed by one column per task."""
    return SUMMARY_COLUMNS + [task.value.benchmark for task in tasks]


def main():
    """Parse arguments, aggregate scores and write one CSV row to stdout.

    Diagnostics go to stderr so that stdout carries only CSV.
    """
    parser = argparse.ArgumentParser(description='Calculate average scores from JSON with scores')
    parser.add_argument('json', type=str, help='Path to JSON file with scores')
    parser.add_argument('--header', action='store_true', help='Print header')
    parser.add_argument('-d', '--delimiter', type=str, default=',', help='Delimiter for CSV output')
    args = parser.parse_args()

    paths = collect_result_paths(args.json)
    print(paths, file=sys.stderr)
    if not paths:
        print(f'No result files found for {args.json}', file=sys.stderr)

    results = load_results(paths, Tasks)
    print(results, file=sys.stderr)

    all_tasks_wo_polqa = [task for task in all_tasks if 'polqa' not in task]
    # NOTE(review): baselines are always scaled by 100, including for
    # perplexity/eqbench metrics whose results are left unscaled -- confirm
    # that this is intended.
    baselines = {task.value.benchmark: task.value.baseline * 100 for task in Tasks}
    print(baselines, file=sys.stderr)

    average_old = plain_average(all_tasks_wo_polqa, results)
    average = normalized_average(all_tasks, results, baselines)
    for task in all_tasks:
        print(task, results.get(task, 0), baselines.get(task, 0), file=sys.stderr)
    average_g = normalized_average(g_tasks, results, baselines)
    average_mc = normalized_average(mc_tasks, results, baselines)
    average_rag = normalized_average(rag_tasks, results, baselines)

    # The second column ('name') is left empty here.
    row = [args.json, None, average, average_old, average_g, average_mc, average_rag]
    row.extend(results.get(task.value.benchmark) for task in Tasks)

    writer = csv.writer(sys.stdout, delimiter=args.delimiter)
    if args.header:
        writer.writerow(build_header(Tasks))
    writer.writerow(row)


if __name__ == '__main__':
    main()