dnaihao committed on
Commit 5d9b035 · 1 Parent(s): 39793ac

Initial submission

Files changed (4)
  1. README.md +2 -2
  2. app.py +2 -2
  3. chumor_leaderboard_submission/result.csv +12 -0
  4. utils.py +13 -20
README.md CHANGED
@@ -1,5 +1,5 @@
 ---
-title: MMLU Pro
+title: Chumor
 emoji: 🥇
 colorFrom: green
 colorTo: indigo
@@ -43,4 +43,4 @@ If you encounter problem on the space, don't hesitate to restart it to remove th
 You'll find
 - the main table' columns names and properties in `src/display/utils.py`
 - the logic to read all results and request files, then convert them in dataframe lines, in `src/leaderboard/read_evals.py`, and `src/populate.py`
-- teh logic to allow or filter submissions in `src/submission/submit.py` and `src/submission/check_validity.py`
+- the logic to allow or filter submissions in `src/submission/submit.py` and `src/submission/check_validity.py`
app.py CHANGED
@@ -6,7 +6,7 @@ def update_table(query, min_size, max_size, selected_subjects=None):
     df = get_df()
     filtered_df = search_and_filter_models(df, query, min_size, max_size)
     if selected_subjects and len(selected_subjects) > 0:
-        base_columns = ['Models', 'Model Size(B)', 'Data Source', 'Overall']
+        base_columns = ['Models', 'Model Size(B)', 'Data Source', 'DP Acc']
         selected_columns = base_columns + selected_subjects
         filtered_df = filtered_df[selected_columns]
     return filtered_df
@@ -54,7 +54,7 @@ with gr.Blocks() as block:
     )
 
     # Add the subject selector
-    subject_choices = [col for col in COLUMN_NAMES if col not in ['Models', 'Model Size(B)', 'Data Source', 'Overall']]
+    subject_choices = [col for col in COLUMN_NAMES if col not in ['Models', 'Model Size(B)', 'Data Source', 'DP Acc']]
     with gr.Row():
         subjects_select = gr.CheckboxGroup(
             choices=subject_choices,
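
For context on this change, here is a minimal, self-contained sketch of the column-selection behaviour that update_table now implements, using a toy DataFrame in place of get_df() and omitting search_and_filter_models; the sample rows and the helper name select_columns are illustrative only, not part of the commit.

import pandas as pd

# Toy stand-in for get_df(); only the column names matter here.
df = pd.DataFrame({
    "Models": ["GPT-4o", "Yi"],
    "Model Size(B)": ["unk", "34"],
    "Data Source": ["[LIT Lab](https://arxiv.org/abs/2406.12754)"] * 2,
    "DP Acc": [51.87, 44.95],
    "CoT Acc": [50.64, 47.17],
})

def select_columns(df, selected_subjects=None):
    # Mirrors the filtering branch of update_table: always keep the base
    # columns, then append whichever metric columns were ticked in the UI.
    if selected_subjects and len(selected_subjects) > 0:
        base_columns = ['Models', 'Model Size(B)', 'Data Source', 'DP Acc']
        return df[base_columns + selected_subjects]
    return df

print(select_columns(df, ["CoT Acc"]))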
chumor_leaderboard_submission/result.csv ADDED
@@ -0,0 +1,12 @@
+Models,Model Size(B),Data Source,DP Acc,DP False Positive Rate,DP False Negative Score,DP MCC,CoT Acc,CoT False Positive Rate,CoT False Negative Score,CoT MCC
+Human,-,[LIT Lab](https://arxiv.org/abs/2406.12754),78.30,,,0.60,78.30,,,0.60
+Athene,70B,[LIT Lab](https://arxiv.org/abs/2406.12754),44.59,97.83,0.28,0.08,47.26,91.10,2.89,0.12
+ERNIE-4-turbo,-,[LIT Lab](https://arxiv.org/abs/2406.12754),60.29,59.83,13.57,0.29,45.16,96.93,0.14,0.11
+Gemini-1.5-pro,-,[LIT Lab](https://arxiv.org/abs/2406.12754),54.00,77.42,5.17,0.24,60.32,33.81,47.31,0.19
+GLM-4-plus,-,[LIT Lab](https://arxiv.org/abs/2406.12754),55.56,72.28,8.26,0.24,58.13,32.96,53.44,0.14
+GPT-4-turbo,-,[LIT Lab](https://arxiv.org/abs/2406.12754),52.32,79.28,6.61,0.20,51.27,80.87,6.96,0.17
+GPT-4o,unk,[LIT Lab](https://arxiv.org/abs/2406.12754),51.87,80.02,6.68,0.19,50.64,85.00,3.03,0.20
+Nemontron,70B,[LIT Lab](https://arxiv.org/abs/2406.12754),56.30,61.26,20.87,0.19,57.17,40.28,46.14,0.14
+Mistral,123B,[LIT Lab](https://arxiv.org/abs/2406.12754),55.56,69.26,12.19,0.22,51.18,79.92,8.40,0.16
+QWen-2.5,72B,[LIT Lab](https://arxiv.org/abs/2406.12754),48.46,90.67,0.69,0.19,49.45,86.91,3.31,0.17
+Yi,34B,[LIT Lab](https://arxiv.org/abs/2406.12754),44.95,97.24,0.21,0.10,47.17,89.30,5.44,0.09
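
A rough sketch of how this file could be read back for display, assuming pandas and the path of the file added above (the path follows the file actually committed, result.csv, rather than the results.csv that CSV_DIR in utils.py points at); everything beyond the header-row column names is illustrative.

import pandas as pd

# Load the submitted results; empty cells (e.g. the Human row's false
# positive/negative columns) are parsed as NaN.
results = pd.read_csv("chumor_leaderboard_submission/result.csv")

# Rank entries by direct-prompting accuracy, highest first.
ranked = results.sort_values("DP Acc", ascending=False)
print(ranked[["Models", "Model Size(B)", "DP Acc", "CoT Acc", "DP MCC", "CoT MCC"]].to_string(index=False))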
utils.py CHANGED
@@ -8,30 +8,27 @@ from huggingface_hub import Repository
 
 HF_TOKEN = os.environ.get("HF_TOKEN")
 
-SUBJECTS = ["Biology", "Business", "Chemistry", "Computer Science", "Economics", "Engineering",
-            "Health", "History", "Law", "Math", "Philosophy", "Physics", "Psychology", "Other"]
 
 MODEL_INFO = [
     "Models", "Model Size(B)", "Data Source",
-    "Overall",
-    "Biology", "Business", "Chemistry", "Computer Science", "Economics", "Engineering",
-    "Health", "History", "Law", "Math", "Philosophy", "Physics", "Psychology", "Other"]
+    "DP Acc", "DP False Positive Rate", "DP False Negative Score", "DP MCC",
+    "CoT Acc", "CoT False Positive Rate", "CoT False Negative Score", "CoT MCC"
+]
+
 
-DATA_TITLE_TYPE = ['markdown', 'str', 'markdown', 'number',
-                   'number', 'number', 'number', 'number', 'number', 'number',
-                   'number', 'number', 'number', 'number', 'number', 'number', 'number',
-                   'number']
+DATA_TITLE_TYPE = ['markdown', 'str', 'markdown',
+                   'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number']
 
-SUBMISSION_NAME = "mmlu_pro_leaderboard_submission"
-SUBMISSION_URL = os.path.join("https://huggingface.co/datasets/TIGER-Lab/", SUBMISSION_NAME)
-CSV_DIR = "./mmlu_pro_leaderboard_submission/results.csv"
+SUBMISSION_NAME = "chumor_leaderboard_submission"
+SUBMISSION_URL = os.path.join("https://huggingface.co/spaces/dnaihao/", SUBMISSION_NAME)
+CSV_DIR = "./chumor_leaderboard_submission/results.csv"
 
 COLUMN_NAMES = MODEL_INFO
 
-LEADERBOARD_INTRODUCTION = """# MMLU-Pro Leaderboard
+LEADERBOARD_INTRODUCTION = """# Chumor Leaderboard
 
 ## Introduction
-We introduce MMLU-Pro, an enhanced benchmark designed to evaluate language understanding models across broader and more challenging tasks. Building on the Massive Multitask Language Understanding (MMLU) dataset, MMLU-Pro integrates more challenging, reasoning-focused questions and increases the answer choices per question from four to ten, significantly raising the difficulty and reducing the chance of success through random guessing. MMLU-Pro comprises over 12,000 rigorously curated questions from academic exams and textbooks, spanning 14 diverse domains including Biology, Business, Chemistry, Computer Science, Economics, Engineering, Health, History, Law, Math, Philosophy, Physics, Psychology, and Others.
+We introduce Chumor, an enhanced benchmark designed to evaluate language understanding models across broader and more challenging tasks. Building on the Massive Multitask Language Understanding (MMLU) dataset, MMLU-Pro integrates more challenging, reasoning-focused questions and increases the answer choices per question from four to ten, significantly raising the difficulty and reducing the chance of success through random guessing. MMLU-Pro comprises over 12,000 rigorously curated questions from academic exams and textbooks, spanning 14 diverse domains including Biology, Business, Chemistry, Computer Science, Economics, Engineering, Health, History, Law, Math, Philosophy, Physics, Psychology, and Others.
 
 Note: For inclusion in our leaderboard, submissions must provide substantial evidence demonstrating that their system is a genuine language model. We maintain strict verification standards to ensure the integrity and comparability of the results.
 
@@ -116,8 +113,6 @@ def add_new_eval(
     upload_data = json.loads(input_file)
     print("upload_data:\n", upload_data)
     data_row = [f'{upload_data["Model"]}', upload_data['Overall']]
-    for subject in SUBJECTS:
-        data_row += [upload_data[subject]]
     print("data_row:\n", data_row)
     submission_repo = Repository(local_dir=SUBMISSION_NAME, clone_from=SUBMISSION_URL,
                                  use_auth_token=HF_TOKEN, repo_type="dataset")
@@ -194,7 +189,7 @@ def get_size_range(df):
 
 
 def process_model_size(size):
-    if pd.isna(size) or size == 'unk':
+    if pd.isna(size) or size == 'unk' or size == "-":
         return 'unknown'
     try:
         val = float(size)
@@ -207,12 +202,10 @@ def filter_columns_by_subjects(df, selected_subjects=None):
     if selected_subjects is None or len(selected_subjects) == 0:
        return df[COLUMN_NAMES]
 
-    base_columns = ['Models', 'Model Size(B)', 'Data Source', 'Overall']
+    base_columns = ['Models', 'Model Size(B)', 'Data Source', 'DP Acc']
     selected_columns = base_columns + selected_subjects
 
     available_columns = [col for col in selected_columns if col in df.columns]
     return df[available_columns]
 
-def get_subject_choices():
-    return SUBJECTS
 
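
To make the process_model_size change above concrete, here is a small sketch of the helper with the truncated tail of the hunk filled in as an assumption (the except fallback and final return are not shown in the diff); the driver loop is illustrative.

import pandas as pd

def process_model_size(size):
    # The first branch matches the new hunk: NaN, 'unk', and '-' all map
    # to the 'unknown' bucket used by the size-range filter.
    if pd.isna(size) or size == 'unk' or size == "-":
        return 'unknown'
    try:
        val = float(size)
    except ValueError:
        # Assumed fallback; the hunk ends right after float(size).
        return 'unknown'
    return val

for raw in ["-", "unk", "70", "123"]:
    print(raw, "->", process_model_size(raw))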