import glob
import json
import argparse
import sys
from dataclasses import dataclass
from enum import Enum
import csv
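
# Aggregates per-task scores from evaluation results JSON files (a single file, or a
# directory searched recursively for results*.json) and prints a CSV row with plain
# and baseline-normalized averages plus one column per task.
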
@dataclass(frozen=True)
class Task:
    benchmark: str
    metric: str
    col_name: str
    type: str
    baseline: float = 0.0


# from src.about import Tasks, get_tasks
class Tasks(Enum):
    # task_key in the json file, metric_key in the json file, name to display in the leaderboard, task type, baseline score
    # task2 = Task("belebele_pol_Latn", "acc,none", "belebele_pol_Latn", "multiple_choice", 0.279)
    task3 = Task("polemo2_in", "exact_match,score-first", "polemo2-in_g", "generate_until", 0.416)
    task4 = Task("polemo2_in_multiple_choice", "acc,none", "polemo2-in_mc", "multiple_choice", 0.416)
    task5 = Task("polemo2_out", "exact_match,score-first", "polemo2-out_g", "generate_until", 0.368)
    task6 = Task("polemo2_out_multiple_choice", "acc,none", "polemo2-out_mc", "multiple_choice", 0.368)
    task7 = Task("polish_8tags_multiple_choice", "acc,none", "8tags_mc", "multiple_choice", 0.143)
    task8 = Task("polish_8tags_regex", "exact_match,score-first", "8tags_g", "generate_until", 0.143)
    task9a = Task("polish_belebele_mc", "acc,none", "belebele_mc", "multiple_choice", 0.279)
    task9 = Task("polish_belebele_regex", "exact_match,score-first", "belebele_g", "generate_until", 0.279)
    task10 = Task("polish_dyk_multiple_choice", "f1,none", "dyk_mc", "multiple_choice", 0.289)
    task11 = Task("polish_dyk_regex", "f1,score-first", "dyk_g", "generate_until", 0.289)
    task12 = Task("polish_ppc_multiple_choice", "acc,none", "ppc_mc", "multiple_choice", 0.419)
    task13 = Task("polish_ppc_regex", "exact_match,score-first", "ppc_g", "generate_until", 0.419)
    task14 = Task("polish_psc_multiple_choice", "f1,none", "psc_mc", "multiple_choice", 0.466)
    task15 = Task("polish_psc_regex", "f1,score-first", "psc_g", "generate_until", 0.466)
    task16 = Task("polish_cbd_multiple_choice", "f1,none", "cbd_mc", "multiple_choice", 0.149)
    task17 = Task("polish_cbd_regex", "f1,score-first", "cbd_g", "generate_until", 0.149)
    task18 = Task("polish_klej_ner_multiple_choice", "acc,none", "klej_ner_mc", "multiple_choice", 0.343)
    task19 = Task("polish_klej_ner_regex", "exact_match,score-first", "klej_ner_g", "generate_until", 0.343)
    task21 = Task("polish_polqa_reranking_multiple_choice", "acc,none", "polqa_reranking_mc", "multiple_choice", 0.5335588952710677)  # multiple_choice
    task22 = Task("polish_polqa_open_book", "levenshtein,none", "polqa_open_book_g", "generate_until", 0.0)  # generate_until
    task23 = Task("polish_polqa_closed_book", "levenshtein,none", "polqa_closed_book_g", "generate_until", 0.0)  # generate_until
    task24 = Task("polish_poquad_open_book", "levenshtein,none", "poquad_open_book", "generate_until", 0.0)
    task25 = Task("polish_eq_bench_first_turn", "first_eqbench,none", "eq_bench_first_turn", "generate_until", 0.0)
    task26 = Task("polish_eq_bench", "average_eqbench,none", "eq_bench", "other", 0.0)
    task20 = Task("polish_poleval2018_task3_test_10k", "word_perplexity,none", "poleval2018_task3_test_10k", "other")
    task27 = Task("polish_poquad_reranking", "acc,none", "poquad_reranking", "other", 0.0)
    task28 = Task("polish_abstractive_poquad_rag", "levenshtein,none", "abstractive_poquad_rag", "other", 0.0)
    task29 = Task("polish_abstractive_poquad_open_book", "levenshtein,none", "abstractive_poquad_open_book", "other", 0.0)
    task30 = Task("polish_pes", "exact_match,score-first", "pes", "other", 0.2)


def get_tasks():
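    """Group benchmark names by task type; rag_tasks is the fixed subset used for average_rag."""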
    g_tasks = [task.value.benchmark for task in Tasks if task.value.type == "generate_until"]
    mc_tasks = [task.value.benchmark for task in Tasks if task.value.type == "multiple_choice"]
    rag_tasks = ['polish_polqa_reranking_multiple_choice', 'polish_polqa_open_book', 'polish_poquad_open_book']
    all_tasks = g_tasks + mc_tasks
    return g_tasks, mc_tasks, rag_tasks, all_tasks


g_tasks, mc_tasks, rag_tasks, all_tasks = get_tasks()
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Calculate average scores from JSON with scores')
    parser.add_argument('json', type=str, help='Path to JSON file with scores')
    parser.add_argument('--header', action='store_true', help='Print header')
    parser.add_argument('-d', '--delimiter', type=str, default=',', help='Delimiter for CSV output')
    args = parser.parse_args()

    # A single results file can be given directly; otherwise the argument is treated as
    # a directory and all results*.json files below it are collected.
    if args.json.endswith('.json'):
        paths = [args.json]
    else:
        paths = glob.glob(args.json + '/**/results*.json', recursive=True)
    print(paths)
    # paths=[args.json]
    results = {}
    for path in paths:
        print(path)
        with open(path) as f:
            data = json.load(f)
        for task in Tasks:
            try:
                # print(task.value.benchmark, task.value.baseline)
                # print(data['results'][task.value.benchmark], data['results'][task.value.benchmark][task.value.metric])
                results[task.value.benchmark] = data['results'][task.value.benchmark][task.value.metric]
                # Perplexity and EQ-Bench metrics stay on their own scale; everything else is scaled to percent.
                if 'perplexity' not in task.value.metric and 'eqbench' not in task.value.metric:
                    results[task.value.benchmark] *= 100
                # if 'perplexity' in task.metric or 'eqbench' in task.metric:
                #     mean_acc = np.mean(accs)
                # else:
                #     mean_acc = np.mean(accs) * 100.0
            except KeyError:
                print(f'No data for {task.value.benchmark}', file=sys.stderr)
    # results=data['results']
    print(results)
    all_tasks_wo_polqa = [task for task in all_tasks if 'polqa' not in task]
    baselines = {task.value.benchmark: task.value.baseline * 100 for task in Tasks}
    print(baselines)

    # "Old" average: plain mean over the non-PolQA tasks, without baseline normalization.
    average_old = sum([v for task, v in results.items() if v is not None and task in all_tasks_wo_polqa]) / len(
        all_tasks_wo_polqa)
    # Baseline-normalized averages: each score is rescaled so the task baseline maps to 0 and a
    # perfect score maps to 100; tasks missing from results default to a raw score of 0.
    average = sum(
        [(results.get(task, 0) - baselines.get(task, 0)) / (100 - baselines.get(task, 0)) * 100 for task in
         all_tasks]) / len(all_tasks)
    for task in all_tasks:
        print(task, results.get(task, 0), baselines.get(task, 0))
    average_g = sum(
        [(results.get(task, 0) - baselines.get(task, 0)) / (100 - baselines.get(task, 0)) * 100 for task in
         g_tasks]) / len(g_tasks)
    average_mc = sum(
        [(results.get(task, 0) - baselines.get(task, 0)) / (100 - baselines.get(task, 0)) * 100 for task in
         mc_tasks]) / len(mc_tasks)
    average_rag = sum(
        [(results.get(task, 0) - baselines.get(task, 0)) / (100 - baselines.get(task, 0)) * 100 for task in
         rag_tasks]) / len(rag_tasks)
    # for task in Tasks:
    #     print(task.value.benchmark, task.value.baseline)
    #     print(data['results'][task.value.benchmark])
    # print(f'Average: {average:.2f}')
    # print(f'Average generate: {average_g:.2f}')
    # print(f'Average multiple choice: {average_mc:.2f}')
    # print(f'Average old: {average_old:.2f}')

    # Build the CSV row: file path, a placeholder for the 'name' column, the aggregate
    # averages, then one column per task (None where a task has no result).
    row = [args.json, None, average, average_old, average_g, average_mc, average_rag]
    for task in Tasks:
        row.append(results.get(task.value.benchmark, None))
    # print header ('average_rag' is included so the header matches the data row)
    if args.header:
        csv.writer(sys.stdout, delimiter=args.delimiter).writerow(
            ['file', 'name', 'average', 'average_old', 'average_g', 'average_mc', 'average_rag']
            + [task.value.benchmark for task in Tasks])
    # print(row)
    csv.writer(sys.stdout, delimiter=args.delimiter).writerow(row)