Initial submission
- README.md +2 -2
- app.py +2 -2
- chumor_leaderboard_submission/result.csv +12 -0
- utils.py +13 -20
README.md
CHANGED
@@ -1,5 +1,5 @@
 ---
-title:
+title: Chumor
 emoji: 🥇
 colorFrom: green
 colorTo: indigo
@@ -43,4 +43,4 @@ If you encounter a problem on the space, don't hesitate to restart it to remove the
 You'll find
 - the main table's column names and properties in `src/display/utils.py`
 - the logic to read all results and request files, then convert them into dataframe lines, in `src/leaderboard/read_evals.py` and `src/populate.py`
-
+- the logic to allow or filter submissions in `src/submission/submit.py` and `src/submission/check_validity.py`
app.py
CHANGED
@@ -6,7 +6,7 @@ def update_table(query, min_size, max_size, selected_subjects=None):
     df = get_df()
     filtered_df = search_and_filter_models(df, query, min_size, max_size)
     if selected_subjects and len(selected_subjects) > 0:
-        base_columns = ['Models', 'Model Size(B)', 'Data Source', '
+        base_columns = ['Models', 'Model Size(B)', 'Data Source', 'DP Acc']
         selected_columns = base_columns + selected_subjects
         filtered_df = filtered_df[selected_columns]
     return filtered_df
@@ -54,7 +54,7 @@ with gr.Blocks() as block:
     )
 
     # Add the subject selector
-    subject_choices = [col for col in COLUMN_NAMES if col not in ['Models', 'Model Size(B)', 'Data Source', '
+    subject_choices = [col for col in COLUMN_NAMES if col not in ['Models', 'Model Size(B)', 'Data Source', 'DP Acc']]
     with gr.Row():
         subjects_select = gr.CheckboxGroup(
             choices=subject_choices,
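For context, a minimal self-contained sketch of the pattern this diff implements: a `gr.CheckboxGroup` whose selection re-filters which metric columns the leaderboard table shows. `get_df` and `search_and_filter_models` belong to the Space's own code and are replaced here by a stub dataframe, so this is an illustration of the wiring, not the Space's exact app:

```python
import gradio as gr
import pandas as pd

# Stub standing in for the Space's get_df(); columns follow result.csv.
df = pd.DataFrame({
    "Models": ["GPT-4o", "Yi"],
    "Model Size(B)": ["unk", "34B"],
    "Data Source": ["LIT Lab", "LIT Lab"],
    "DP Acc": [51.87, 44.95],
    "CoT Acc": [50.64, 47.17],
    "CoT MCC": [0.20, 0.09],
})

BASE_COLUMNS = ["Models", "Model Size(B)", "Data Source", "DP Acc"]

def update_table(selected_subjects):
    # Identity columns plus the headline DP Acc metric are always shown;
    # any checked metrics are appended after them.
    if selected_subjects:
        return df[BASE_COLUMNS + selected_subjects]
    return df

metric_choices = [c for c in df.columns if c not in BASE_COLUMNS]

with gr.Blocks() as demo:
    selector = gr.CheckboxGroup(choices=metric_choices, label="Metrics")
    table = gr.Dataframe(value=df)
    # Re-filter the table whenever the checkbox selection changes.
    selector.change(update_table, inputs=selector, outputs=table)

if __name__ == "__main__":
    demo.launch()
```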
chumor_leaderboard_submission/result.csv
ADDED
@@ -0,0 +1,12 @@
+Models,Model Size(B),Data Source,DP Acc,DP False Positive Rate,DP False Negative Score,DP MCC,CoT Acc,CoT False Positive Rate,CoT False Negative Score,CoT MCC
+Human,-,[LIT Lab](https://arxiv.org/abs/2406.12754),78.30,,,0.60,78.30,,,0.60
+Athene,70B,[LIT Lab](https://arxiv.org/abs/2406.12754),44.59,97.83,0.28,0.08,47.26,91.10,2.89,0.12
+ERNIE-4-turbo,-,[LIT Lab](https://arxiv.org/abs/2406.12754),60.29,59.83,13.57,0.29,45.16,96.93,0.14,0.11
+Gemini-1.5-pro,-,[LIT Lab](https://arxiv.org/abs/2406.12754),54.00,77.42,5.17,0.24,60.32,33.81,47.31,0.19
+GLM-4-plus,-,[LIT Lab](https://arxiv.org/abs/2406.12754),55.56,72.28,8.26,0.24,58.13,32.96,53.44,0.14
+GPT-4-turbo,-,[LIT Lab](https://arxiv.org/abs/2406.12754),52.32,79.28,6.61,0.20,51.27,80.87,6.96,0.17
+GPT-4o,unk,[LIT Lab](https://arxiv.org/abs/2406.12754),51.87,80.02,6.68,0.19,50.64,85.00,3.03,0.20
+Nemontron,70B,[LIT Lab](https://arxiv.org/abs/2406.12754),56.30,61.26,20.87,0.19,57.17,40.28,46.14,0.14
+Mistral,123B,[LIT Lab](https://arxiv.org/abs/2406.12754),55.56,69.26,12.19,0.22,51.18,79.92,8.40,0.16
+QWen-2.5,72B,[LIT Lab](https://arxiv.org/abs/2406.12754),48.46,90.67,0.69,0.19,49.45,86.91,3.31,0.17
+Yi,34B,[LIT Lab](https://arxiv.org/abs/2406.12754),44.95,97.24,0.21,0.10,47.17,89.30,5.44,0.09
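A small sketch of how this file could be loaded and ranked. The path matches the file added above; note that `CSV_DIR` in utils.py points at `results.csv` (plural), so one of the two names may need aligning:

```python
import pandas as pd

# Path of the file added in this commit.
df = pd.read_csv("./chumor_leaderboard_submission/result.csv")

# The first three columns are metadata; everything after is numeric.
metric_cols = df.columns[3:]
df[metric_cols] = df[metric_cols].apply(pd.to_numeric, errors="coerce")

# Rank by direct-prompting accuracy, best first.
print(df.sort_values("DP Acc", ascending=False)[["Models", "DP Acc", "CoT Acc"]])
```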
utils.py
CHANGED
@@ -8,30 +8,27 @@ from huggingface_hub import Repository
 
 HF_TOKEN = os.environ.get("HF_TOKEN")
 
-SUBJECTS = ["Biology", "Business", "Chemistry", "Computer Science", "Economics", "Engineering",
-            "Health", "History", "Law", "Math", "Philosophy", "Physics", "Psychology", "Other"]
 
 MODEL_INFO = [
     "Models", "Model Size(B)", "Data Source",
-    "
-    "
-
+    "DP Acc", "DP False Positive Rate", "DP False Negative Score", "DP MCC",
+    "CoT Acc", "CoT False Positive Rate", "CoT False Negative Score", "CoT MCC"
+]
+
 
-DATA_TITLE_TYPE = ['markdown', 'str', 'markdown',
-                   'number', 'number', 'number', 'number', 'number', 'number',
-                   'number', 'number', 'number', 'number', 'number', 'number', 'number',
-                   'number']
+DATA_TITLE_TYPE = ['markdown', 'str', 'markdown',
+                   'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number']
 
-SUBMISSION_NAME = "
-SUBMISSION_URL = os.path.join("https://huggingface.co/
-CSV_DIR = "./
+SUBMISSION_NAME = "chumor_leaderboard_submission"
+SUBMISSION_URL = os.path.join("https://huggingface.co/spaces/dnaihao/", SUBMISSION_NAME)
+CSV_DIR = "./chumor_leaderboard_submission/results.csv"
 
 COLUMN_NAMES = MODEL_INFO
 
-LEADERBOARD_INTRODUCTION = """#
+LEADERBOARD_INTRODUCTION = """# Chumor Leaderboard
 
 ## Introduction
-We introduce
+We introduce Chumor, a benchmark for evaluating Chinese humor understanding in large language models (https://arxiv.org/abs/2406.12754). Models are asked to judge whether a joke is genuinely funny, under both direct prompting (DP) and chain-of-thought (CoT) settings, and we report accuracy, false positive rate, false negative score, and Matthews correlation coefficient (MCC) for each setting.
 
 Note: For inclusion in our leaderboard, submissions must provide substantial evidence demonstrating that their system is a genuine language model. We maintain strict verification standards to ensure the integrity and comparability of the results.
 
@@ -116,8 +113,6 @@ def add_new_eval(
     upload_data = json.loads(input_file)
     print("upload_data:\n", upload_data)
     data_row = [f'{upload_data["Model"]}', upload_data['Overall']]
-    for subject in SUBJECTS:
-        data_row += [upload_data[subject]]
     print("data_row:\n", data_row)
     submission_repo = Repository(local_dir=SUBMISSION_NAME, clone_from=SUBMISSION_URL,
                                  use_auth_token=HF_TOKEN, repo_type="dataset")
@@ -194,7 +189,7 @@ def get_size_range(df):
 
 
 def process_model_size(size):
-    if pd.isna(size) or size == 'unk':
+    if pd.isna(size) or size == 'unk' or size == "-":
         return 'unknown'
     try:
         val = float(size)
@@ -207,12 +202,10 @@ def filter_columns_by_subjects(df, selected_subjects=None):
     if selected_subjects is None or len(selected_subjects) == 0:
         return df[COLUMN_NAMES]
 
-    base_columns = ['Models', 'Model Size(B)', 'Data Source', '
+    base_columns = ['Models', 'Model Size(B)', 'Data Source', 'DP Acc']
     selected_columns = base_columns + selected_subjects
 
     available_columns = [col for col in selected_columns if col in df.columns]
     return df[available_columns]
 
-def get_subject_choices():
-    return SUBJECTS
 
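For reference, a runnable sketch of the size normalization this commit extends. The diff truncates the function body after the `try`, so the suffix parsing below is an assumption based on sizes like "70B" in result.csv, not the Space's exact code:

```python
import pandas as pd

def process_model_size(size):
    # Missing values, 'unk', and the '-' used by the Human and API-only
    # rows in result.csv all normalize to 'unknown' after this commit.
    if pd.isna(size) or size == 'unk' or size == "-":
        return 'unknown'
    try:
        # Assumed parsing: strip a trailing "B" from strings like "70B"
        # before converting (the diff hides the Space's actual handling).
        return float(str(size).rstrip("Bb"))
    except ValueError:
        return 'unknown'

for s in ["70B", "123B", "unk", "-", None]:
    print(repr(s), "->", process_model_size(s))
```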
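And a hedged sketch of the submission path around the `Repository(...)` call shown in the diff. `Repository` is huggingface_hub's legacy git-backed class; the CSV append and push steps here are illustrative assumptions, since the diff only shows the clone:

```python
import csv
import json
import os
from huggingface_hub import Repository

HF_TOKEN = os.environ.get("HF_TOKEN")
SUBMISSION_NAME = "chumor_leaderboard_submission"
SUBMISSION_URL = os.path.join("https://huggingface.co/spaces/dnaihao/", SUBMISSION_NAME)

def add_new_eval(input_file: str):
    upload_data = json.loads(input_file)
    # After this commit only the model name and overall score are read;
    # the per-subject loop over SUBJECTS was removed.
    data_row = [upload_data["Model"], upload_data["Overall"]]

    # Clone (or reuse) the submission repo, append the row, and push.
    repo = Repository(local_dir=SUBMISSION_NAME, clone_from=SUBMISSION_URL,
                      use_auth_token=HF_TOKEN, repo_type="dataset")
    with open(os.path.join(SUBMISSION_NAME, "result.csv"), "a", newline="") as f:
        csv.writer(f).writerow(data_row)
    repo.push_to_hub(commit_message=f"Add {upload_data['Model']} results")
```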