Spaces:
AIR-Bench
/
Running on CPU Upgrade

leaderboard / src /benchmarks.py
nan's picture
feat: seperate the qa and longdoc tasks
9134169
raw
history blame
3.72 kB
from dataclasses import dataclass
from enum import Enum
def get_safe_name(name: str):
"""Get RFC 1123 compatible safe name"""
name = name.replace('-', '_')
return ''.join(
character.lower()
for character in name
if (character.isalnum() or character == '_'))
dataset_dict = {
"qa": {
"wiki": {
"en": ["wikipedia_20240101", ],
"zh": ["wikipedia_20240101", ]
},
"web": {
"en": ["mC4", ],
"zh": ["mC4", ]
},
"news": {
"en": ["CC-News", ],
"zh": ["CC-News", ]
},
"health": {
"en": ["PubMedQA", ],
"zh": ["Huatuo-26M", ]
},
"law": {
"en": ["pile-of-law", ],
"zh": ["flk_npc_gov_cn", ]
},
"finance": {
"en": ["Reuters-Financial", ],
"zh": ["FinCorpus", ]
},
"arxiv": {
"en": ["Arxiv", ]},
},
"long_doc": {
"arxiv": {
"en": ["gpt-3", "llama2", "llm-survey", "gemini"],
},
"book": {
"en": [
"origin-of-species_darwin",
"a-brief-history-of-time_stephen-hawking"
]
},
"healthcare": {
"en": [
"pubmed_100K-200K_1",
"pubmed_100K-200K_2",
"pubmed_100K-200K_3",
"pubmed_40K-50K_5-merged",
"pubmed_30K-40K_10-merged"
]
},
"law": {
"en": [
"lex_files_300K-400K",
"lex_files_400K-500K",
"lex_files_500K-600K",
"lex_files_600K-700K"
]
}
}
}
metric_list = [
"ndcg_at_1",
"ndcg_at_3",
"ndcg_at_5",
"ndcg_at_10",
"ndcg_at_100",
"ndcg_at_1000",
"map_at_1",
"map_at_3",
"map_at_5",
"map_at_10",
"map_at_100",
"map_at_1000",
"recall_at_1",
"recall_at_3",
"recall_at_5",
"recall_at_10"
"recall_at_100",
"recall_at_1000",
"precision_at_1",
"precision_at_3",
"precision_at_5",
"precision_at_10",
"precision_at_100",
"precision_at_1000",
"mrr_at_1",
"mrr_at_3",
"mrr_at_5",
"mrr_at_10",
"mrr_at_100",
"mrr_at_1000"
]
@dataclass
class Benchmark:
name: str # [task]_[domain]_[language]_[metric], task_key in the json file,
metric: str # ndcg_at_1 ,metric_key in the json file
col_name: str # [domain]_[language], name to display in the leaderboard
qa_benchmark_dict = {}
long_doc_benchmark_dict = {}
for task, domain_dict in dataset_dict.items():
for domain, lang_dict in domain_dict.items():
for lang, dataset_list in lang_dict.items():
if task == "qa":
benchmark_name = f"{task}_{domain}_{lang}"
benchmark_name = get_safe_name(benchmark_name)
col_name = f"{domain}_{lang}"
for metric in dataset_list:
qa_benchmark_dict[benchmark_name] = Benchmark(benchmark_name, metric, col_name)
elif task == "long_doc":
for dataset in dataset_list:
col_name = f"{domain}_{lang}_{dataset}"
for metric in metric_list:
benchmark_name = f"{task}_{domain}_{lang}_{dataset}_{metric}"
benchmark_name = get_safe_name(benchmark_name)
long_doc_benchmark_dict[benchmark_name] = Benchmark(benchmark_name, metric, col_name)
BenchmarksQA = Enum('BenchmarksQA', qa_benchmark_dict)
BenchmarksLongDoc = Enum('BenchmarksLongDoc', long_doc_benchmark_dict)