dnaihao committed on
Commit 5d9b035 · 1 Parent(s): 39793ac

Initial submission

Files changed (4)
  1. README.md +2 -2
  2. app.py +2 -2
  3. chumor_leaderboard_submission/result.csv +12 -0
  4. utils.py +13 -20
README.md CHANGED
@@ -1,5 +1,5 @@
 ---
-title: MMLU Pro
+title: Chumor
 emoji: 🥇
 colorFrom: green
 colorTo: indigo
@@ -43,4 +43,4 @@ If you encounter problem on the space, don't hesitate to restart it to remove th
 You'll find
 - the main table' columns names and properties in `src/display/utils.py`
 - the logic to read all results and request files, then convert them in dataframe lines, in `src/leaderboard/read_evals.py`, and `src/populate.py`
-- teh logic to allow or filter submissions in `src/submission/submit.py` and `src/submission/check_validity.py`
+- the logic to allow or filter submissions in `src/submission/submit.py` and `src/submission/check_validity.py`
app.py CHANGED
@@ -6,7 +6,7 @@ def update_table(query, min_size, max_size, selected_subjects=None):
     df = get_df()
     filtered_df = search_and_filter_models(df, query, min_size, max_size)
     if selected_subjects and len(selected_subjects) > 0:
-        base_columns = ['Models', 'Model Size(B)', 'Data Source', 'Overall']
+        base_columns = ['Models', 'Model Size(B)', 'Data Source', 'DP Acc']
         selected_columns = base_columns + selected_subjects
         filtered_df = filtered_df[selected_columns]
     return filtered_df
@@ -54,7 +54,7 @@ with gr.Blocks() as block:
     )
 
     # Add the subject selector
-    subject_choices = [col for col in COLUMN_NAMES if col not in ['Models', 'Model Size(B)', 'Data Source', 'Overall']]
+    subject_choices = [col for col in COLUMN_NAMES if col not in ['Models', 'Model Size(B)', 'Data Source', 'DP Acc']]
     with gr.Row():
         subjects_select = gr.CheckboxGroup(
             choices=subject_choices,
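
For context on this change, here is a minimal, self-contained sketch of the column-selection behaviour that update_table now implements, using a toy DataFrame in place of get_df() and omitting search_and_filter_models; the sample rows and the helper name select_columns are illustrative only, not part of the commit.

import pandas as pd

# Toy stand-in for get_df(); only the column names matter here.
df = pd.DataFrame({
    "Models": ["GPT-4o", "Yi"],
    "Model Size(B)": ["unk", "34"],
    "Data Source": ["[LIT Lab](https://arxiv.org/abs/2406.12754)"] * 2,
    "DP Acc": [51.87, 44.95],
    "CoT Acc": [50.64, 47.17],
})

def select_columns(df, selected_subjects=None):
    # Mirrors the filtering branch of update_table: always keep the base
    # columns, then append whichever metric columns were ticked in the UI.
    if selected_subjects and len(selected_subjects) > 0:
        base_columns = ['Models', 'Model Size(B)', 'Data Source', 'DP Acc']
        return df[base_columns + selected_subjects]
    return df

print(select_columns(df, ["CoT Acc"]))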
chumor_leaderboard_submission/result.csv ADDED
@@ -0,0 +1,12 @@
+Models,Model Size(B),Data Source,DP Acc,DP False Positive Rate,DP False Negative Score,DP MCC,CoT Acc,CoT False Positive Rate,CoT False Negative Score,CoT MCC
+Human,-,[LIT Lab](https://arxiv.org/abs/2406.12754),78.30,,,0.60,78.30,,,0.60
+Athene,70B,[LIT Lab](https://arxiv.org/abs/2406.12754),44.59,97.83,0.28,0.08,47.26,91.10,2.89,0.12
+ERNIE-4-turbo,-,[LIT Lab](https://arxiv.org/abs/2406.12754),60.29,59.83,13.57,0.29,45.16,96.93,0.14,0.11
+Gemini-1.5-pro,-,[LIT Lab](https://arxiv.org/abs/2406.12754),54.00,77.42,5.17,0.24,60.32,33.81,47.31,0.19
+GLM-4-plus,-,[LIT Lab](https://arxiv.org/abs/2406.12754),55.56,72.28,8.26,0.24,58.13,32.96,53.44,0.14
+GPT-4-turbo,-,[LIT Lab](https://arxiv.org/abs/2406.12754),52.32,79.28,6.61,0.20,51.27,80.87,6.96,0.17
+GPT-4o,unk,[LIT Lab](https://arxiv.org/abs/2406.12754),51.87,80.02,6.68,0.19,50.64,85.00,3.03,0.20
+Nemontron,70B,[LIT Lab](https://arxiv.org/abs/2406.12754),56.30,61.26,20.87,0.19,57.17,40.28,46.14,0.14
+Mistral,123B,[LIT Lab](https://arxiv.org/abs/2406.12754),55.56,69.26,12.19,0.22,51.18,79.92,8.40,0.16
+QWen-2.5,72B,[LIT Lab](https://arxiv.org/abs/2406.12754),48.46,90.67,0.69,0.19,49.45,86.91,3.31,0.17
+Yi,34B,[LIT Lab](https://arxiv.org/abs/2406.12754),44.95,97.24,0.21,0.10,47.17,89.30,5.44,0.09
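
A rough sketch of how this file could be read back for display, assuming pandas and the path of the file added above (the path follows the file actually committed, result.csv, rather than the results.csv that CSV_DIR in utils.py points at); everything beyond the header-row column names is illustrative.

import pandas as pd

# Load the submitted results; empty cells (e.g. the Human row's false
# positive/negative columns) are parsed as NaN.
results = pd.read_csv("chumor_leaderboard_submission/result.csv")

# Rank entries by direct-prompting accuracy, highest first.
ranked = results.sort_values("DP Acc", ascending=False)
print(ranked[["Models", "Model Size(B)", "DP Acc", "CoT Acc", "DP MCC", "CoT MCC"]].to_string(index=False))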
utils.py CHANGED
@@ -8,30 +8,27 @@ from huggingface_hub import Repository
 
 HF_TOKEN = os.environ.get("HF_TOKEN")
 
-SUBJECTS = ["Biology", "Business", "Chemistry", "Computer Science", "Economics", "Engineering",
-            "Health", "History", "Law", "Math", "Philosophy", "Physics", "Psychology", "Other"]
 
 MODEL_INFO = [
     "Models", "Model Size(B)", "Data Source",
-    "Overall",
-    "Biology", "Business", "Chemistry", "Computer Science", "Economics", "Engineering",
-    "Health", "History", "Law", "Math", "Philosophy", "Physics", "Psychology", "Other"]
+    "DP Acc", "DP False Positive Rate", "DP False Negative Score", "DP MCC",
+    "CoT Acc", "CoT False Positive Rate", "CoT False Negative Score", "CoT MCC"
+]
+
 
-DATA_TITLE_TYPE = ['markdown', 'str', 'markdown', 'number',
-                   'number', 'number', 'number', 'number', 'number', 'number',
-                   'number', 'number', 'number', 'number', 'number', 'number', 'number',
-                   'number']
+DATA_TITLE_TYPE = ['markdown', 'str', 'markdown',
+                   'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number']
 
-SUBMISSION_NAME = "mmlu_pro_leaderboard_submission"
-SUBMISSION_URL = os.path.join("https://huggingface.co/datasets/TIGER-Lab/", SUBMISSION_NAME)
-CSV_DIR = "./mmlu_pro_leaderboard_submission/results.csv"
+SUBMISSION_NAME = "chumor_leaderboard_submission"
+SUBMISSION_URL = os.path.join("https://huggingface.co/spaces/dnaihao/", SUBMISSION_NAME)
+CSV_DIR = "./chumor_leaderboard_submission/results.csv"
 
 COLUMN_NAMES = MODEL_INFO
 
-LEADERBOARD_INTRODUCTION = """# MMLU-Pro Leaderboard
+LEADERBOARD_INTRODUCTION = """# Chumor Leaderboard
 
 ## Introduction
-We introduce MMLU-Pro, an enhanced benchmark designed to evaluate language understanding models across broader and more challenging tasks. Building on the Massive Multitask Language Understanding (MMLU) dataset, MMLU-Pro integrates more challenging, reasoning-focused questions and increases the answer choices per question from four to ten, significantly raising the difficulty and reducing the chance of success through random guessing. MMLU-Pro comprises over 12,000 rigorously curated questions from academic exams and textbooks, spanning 14 diverse domains including Biology, Business, Chemistry, Computer Science, Economics, Engineering, Health, History, Law, Math, Philosophy, Physics, Psychology, and Others.
+We introduce Chumor, an enhanced benchmark designed to evaluate language understanding models across broader and more challenging tasks. Building on the Massive Multitask Language Understanding (MMLU) dataset, MMLU-Pro integrates more challenging, reasoning-focused questions and increases the answer choices per question from four to ten, significantly raising the difficulty and reducing the chance of success through random guessing. MMLU-Pro comprises over 12,000 rigorously curated questions from academic exams and textbooks, spanning 14 diverse domains including Biology, Business, Chemistry, Computer Science, Economics, Engineering, Health, History, Law, Math, Philosophy, Physics, Psychology, and Others.
 
 Note: For inclusion in our leaderboard, submissions must provide substantial evidence demonstrating that their system is a genuine language model. We maintain strict verification standards to ensure the integrity and comparability of the results.
 
@@ -116,8 +113,6 @@ def add_new_eval(
     upload_data = json.loads(input_file)
     print("upload_data:\n", upload_data)
     data_row = [f'{upload_data["Model"]}', upload_data['Overall']]
-    for subject in SUBJECTS:
-        data_row += [upload_data[subject]]
     print("data_row:\n", data_row)
     submission_repo = Repository(local_dir=SUBMISSION_NAME, clone_from=SUBMISSION_URL,
                                  use_auth_token=HF_TOKEN, repo_type="dataset")
@@ -194,7 +189,7 @@ def get_size_range(df):
 
 
 def process_model_size(size):
-    if pd.isna(size) or size == 'unk':
+    if pd.isna(size) or size == 'unk' or size == "-":
         return 'unknown'
     try:
         val = float(size)
@@ -207,12 +202,10 @@ def filter_columns_by_subjects(df, selected_subjects=None):
     if selected_subjects is None or len(selected_subjects) == 0:
        return df[COLUMN_NAMES]
 
-    base_columns = ['Models', 'Model Size(B)', 'Data Source', 'Overall']
+    base_columns = ['Models', 'Model Size(B)', 'Data Source', 'DP Acc']
     selected_columns = base_columns + selected_subjects
 
     available_columns = [col for col in selected_columns if col in df.columns]
     return df[available_columns]
 
-def get_subject_choices():
-    return SUBJECTS
 
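
To make the process_model_size change above concrete, here is a small sketch of the helper with the truncated tail of the hunk filled in as an assumption (the except fallback and final return are not shown in the diff); the driver loop is illustrative.

import pandas as pd

def process_model_size(size):
    # The first branch matches the new hunk: NaN, 'unk', and '-' all map
    # to the 'unknown' bucket used by the size-range filter.
    if pd.isna(size) or size == 'unk' or size == "-":
        return 'unknown'
    try:
        val = float(size)
    except ValueError:
        # Assumed fallback; the hunk ends right after float(size).
        return 'unknown'
    return val

for raw in ["-", "unk", "70", "123"]:
    print(raw, "->", process_model_size(raw))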