ubowang committed on
Commit 3388e82 · verified · 1 Parent(s): 8ff2460

Update utils.py

Files changed (1)
  1. utils.py +42 -96
utils.py CHANGED
@@ -8,103 +8,42 @@ from huggingface_hub import Repository
 
 HF_TOKEN = os.environ.get("HF_TOKEN")
 
+SUBJECTS = ["Biology", "Business", "Chemistry", "Computer Science", "Economics", "Engineering",
+            "Health", "History", "Law", "Math", "Philosophy", "Physics", "Psychology", "Other"]
+
 MODEL_INFO = [
-    "Model (CoT)",
-    "Avg",
-    "TheoremQA",
-    "MATH",
-    "GSM",
-    "GPQA",
-    "MMLU-STEM"
-    ]
-
-DATA_TITILE_TYPE = ['markdown', 'number', 'number', 'number', 'number', 'number', 'number']
-
-SUBMISSION_NAME = "science_leaderboard_submission"
+    "Models",
+    "Overall",
+    "Biology", "Business", "Chemistry", "Computer Science", "Economics", "Engineering",
+    "Health", "History", "Law", "Math", "Philosophy", "Physics", "Psychology", "Other"]
+
+DATA_TITLE_TYPE = ['markdown', 'number', 'number', 'number', 'number', 'number', 'number',
+                   'number', 'number', 'number', 'number', 'number', 'number', 'number',
+                   'number', 'number']
+
+SUBMISSION_NAME = "mmlu_pro_leaderboard_submission"
 SUBMISSION_URL = os.path.join("https://huggingface.co/datasets/TIGER-Lab/", SUBMISSION_NAME)
-CSV_DIR = "./science_leaderboard_submission/results.csv"
+CSV_DIR = "./mmlu_pro_leaderboard_submission/results.csv"
 
 COLUMN_NAMES = MODEL_INFO
 
-LEADERBORAD_INTRODUCTION = """# Science Leaderboard
-
-**"Which large language model is the BEST on scinece and engineering?"**<br>
-🏆 Welcome to the **Science** leaderboard! The leaderboard covers the most popular evaluation for different science subjects including math, phyiscs, biology, chemistry, computer science, finance.
-<div style="display: flex; flex-wrap: wrap; align-items: center; gap: 10px;">
-</div>
-The evaluation set from the following datasets are being included in the leaderboard.
-<ul>
-<li> MATH (4-shot): this contains the test set of 5000 questions from American Math contest covering different fields like algebra, calculus, statistics, geometry, linear algebra, number theory.
-<li> GSM8K (4-shot): this contains the test set of 1320 questions from grade school math word problems. This dataset is mainly covering algebra problems.
-<li> TheoremQA (5-shot): this contains the test set of 800 questions collected from college-level exams. This covers math, physics, engineering and finance.
-<li> GPQA (5-shot): this contains the test of 198 questions from college-level dataset GPQA-diamond. This covers many fields like chemistry, genetics, biology, etc.
-<li> MMLU-STEM (5-shot): this contains the test of 3.3K questions from MMLU dataset. This covers many fields like math, chemistry, genetics, biology, computer science, anatomy, astronomy, etc.
-</ul>
-
-**"How to evaluate your model and submit your results?"**<br>
-Please refer to the guideline in <a href="https://github.com/TIGER-AI-Lab/MAmmoTH/blob/main/math_eval/README.md">Github</a> to evaluate your own model.
-
-<a href='https://hits.seeyoufarm.com'><img src='https://hits.seeyoufarm.com/api/count/incr/badge.svg?url=https%3A%2F%2Fhuggingface.co%2Fspaces%2FTIGER-Lab%2FTheoremQA-Leaderboard&count_bg=%23C7C83D&title_bg=%23555555&icon=&icon_color=%23E7E7E7&title=hits&edge_flat=false'></a>
+LEADERBOARD_INTRODUCTION = """# MMLU-Pro Leaderboard
+
+MMLU-Pro dataset, a more robust and challenging massive multi-task understanding dataset tailored to more rigorously benchmark large language models' capabilities. This dataset contains 12K complex questions across various disciplines. The following are the accuracies of various models evaluated on MMLU-Pro.
+
+Our dataset is available at [https://huggingface.co/datasets/TIGER-Lab/MMLU-Pro](https://huggingface.co/datasets/TIGER-Lab/MMLU-Pro). If you want to reproduce our results or evaluate your own models on MMLU-Pro, please check out our evaluation scripts at [https://github.com/TIGER-AI-Lab/MMLU-Pro](https://github.com/TIGER-AI-Lab/MMLU-Pro).
 """
 
 TABLE_INTRODUCTION = """
 """
 
-LEADERBORAD_INFO = """
+LEADERBOARD_INFO = """
 We list the information of the used datasets as follows:<br>
 
-MATH: Measuring Mathematical Problem Solving With the MATH Dataset<br>
-<a href='https://arxiv.org/pdf/2103.03874.pdf'>Paper</a><br>
-<a href='https://github.com/hendrycks/math'>Code</a><br>
-
-GSM8K: Training Verifiers to Solve Math Word Problems<br>
-<a href='https://arxiv.org/pdf/2110.14168.pdf'>Paper</a><br>
-<a href='https://github.com/openai/grade-school-math'>Code</a><br>
-
-TheoremQA: A Theorem-driven Question Answering dataset<br>
-<a href='https://arxiv.org/pdf/2305.12524.pdf'>Paper</a><br>
-<a href='https://github.com/TIGER-AI-Lab/TheoremQA'>Code</a><br>
-
-GPQA: A Graduate-Level Google-Proof Q&A Benchmark<br>
-<a href='https://arxiv.org/pdf/2311.12022.pdf'>Paper</a><br>
-<a href='https://github.com/idavidrein/gpqa'>Code</a>
-
-MMLU: Measuring Massive Multitask Language Understanding<br>
-<a href='https://arxiv.org/pdf/2009.03300.pdf'>Paper</a><br>
-<a href='https://github.com/hendrycks/test'>Code</a>
 """
 
 CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
-CITATION_BUTTON_TEXT = r"""@inproceedings{hendrycks2021measuring,
-  title={Measuring Mathematical Problem Solving With the MATH Dataset},
-  author={Hendrycks, Dan and Burns, Collin and Kadavath, Saurav and Arora, Akul and Basart, Steven and Tang, Eric and Song, Dawn and Steinhardt, Jacob},
-  booktitle={Thirty-fifth Conference on Neural Information Processing Systems Datasets and Benchmarks Track (Round 2)},
-  year={2021}
-}
-@article{cobbe2021training,
-  title={Training verifiers to solve math word problems},
-  author={Cobbe, Karl and Kosaraju, Vineet and Bavarian, Mohammad and Chen, Mark and Jun, Heewoo and Kaiser, Lukasz and Plappert, Matthias and Tworek, Jerry and Hilton, Jacob and Nakano, Reiichiro and others},
-  journal={arXiv preprint arXiv:2110.14168},
-  year={2021}
-}
-@inproceedings{chen2023theoremqa,
-  title={Theoremqa: A theorem-driven question answering dataset},
-  author={Chen, Wenhu and Yin, Ming and Ku, Max and Lu, Pan and Wan, Yixin and Ma, Xueguang and Xu, Jianyu and Wang, Xinyi and Xia, Tony},
-  booktitle={The 2023 Conference on Empirical Methods in Natural Language Processing},
-  year={2023}
-}
-@article{rein2023gpqa,
-  title={Gpqa: A graduate-level google-proof q\&a benchmark},
-  author={Rein, David and Hou, Betty Li and Stickland, Asa Cooper and Petty, Jackson and Pang, Richard Yuanzhe and Dirani, Julien and Michael, Julian and Bowman, Samuel R},
-  journal={arXiv preprint arXiv:2311.12022},
-  year={2023}
-}
-@inproceedings{hendrycks2020measuring,
-  title={Measuring Massive Multitask Language Understanding},
-  author={Hendrycks, Dan and Burns, Collin and Basart, Steven and Zou, Andy and Mazeika, Mantas and Song, Dawn and Steinhardt, Jacob},
-  booktitle={International Conference on Learning Representations},
-  year={2020}
-}"""
+CITATION_BUTTON_TEXT = r""""""
 
 SUBMIT_INTRODUCTION = """# Submit on Science Leaderboard Introduction
 
@@ -112,36 +51,40 @@ SUBMIT_INTRODUCTION = """# Submit on Science Leaderboard Introduction
 
 ```json
 {
-    "Model": "[NAME]",
-    "Repo": "https://huggingface.co/[MODEL_NAME]"
-    "TheoremQA": 50,
-    "MATH": 50,
-    "GSM": 50,
-    "GPQA": 50,
-    "MMLU-STEM": 50
+    "Model": "[MODEL_NAME]",
+    "Overall": 0.5678,
+    "Biology": 0.1234,
+    "Business": 0.4567,
+    ...,
+    "Other: 0.3456"
 }
 ```
-After submitting, you can click the "Refresh" button to see the updated leaderboard(it may takes few seconds).
+After submitting, you can click the "Refresh" button to see the updated leaderboard (it may takes few seconds).
 
 """
+
+
 def get_df():
     repo = Repository(local_dir=SUBMISSION_NAME, clone_from=SUBMISSION_URL, use_auth_token=HF_TOKEN)
     repo.git_pull()
     df = pd.read_csv(CSV_DIR)
-    df['Avg'] = df[['TheoremQA', 'MATH', 'GSM', 'GPQA', 'MMLU-STEM']].mean(axis=1).round(1)
-    df = df.sort_values(by=['Avg'], ascending=False)
+    df = df.sort_values(by=['Overall'], ascending=False)
     return df[COLUMN_NAMES]
 
+
 def add_new_eval(
     input_file,
 ):
     if input_file is None:
         return "Error! Empty file!"
 
-    upload_data=json.loads(input_file)
-    data_row = [f'[{upload_data["Model"]}]({upload_data["Repo"]})', upload_data['TheoremQA'], upload_data['MATH'], upload_data['GSM'], upload_data['GPQA'], upload_data['MMLU-STEM']]
+    upload_data = json.loads(input_file)
+    data_row = [f'{upload_data["Model"]}', upload_data['Overall']]
+    for subject in SUBJECTS:
+        data_row += [upload_data[subject]]
 
-    submission_repo = Repository(local_dir=SUBMISSION_NAME, clone_from=SUBMISSION_URL, use_auth_token=HF_TOKEN, repo_type="dataset")
+    submission_repo = Repository(local_dir=SUBMISSION_NAME, clone_from=SUBMISSION_URL,
+                                 use_auth_token=HF_TOKEN, repo_type="dataset")
     submission_repo.git_pull()
 
    already_submitted = []
@@ -162,4 +105,7 @@ def add_new_eval(
 
 
 def refresh_data():
-    return get_df()
+    return get_df()
+
+
+
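With the new format, `add_new_eval` reads the uploaded JSON with `json.loads` and builds one row per submission: the model name, the overall score, and then one score for each entry in `SUBJECTS`, so every subject key must be present or the `upload_data[subject]` lookup raises a `KeyError`. Note that the docstring example in the diff writes `"Other: 0.3456"` with the closing quote misplaced; a well-formed file needs `"Other": 0.3456`. Below is a minimal, hypothetical sketch of writing such a submission file; the model name and every score are placeholders, not real results.

```python
import json

# Subject columns copied from the new utils.py; add_new_eval appends the
# per-subject scores to the row in exactly this order.
SUBJECTS = ["Biology", "Business", "Chemistry", "Computer Science", "Economics", "Engineering",
            "Health", "History", "Law", "Math", "Philosophy", "Physics", "Psychology", "Other"]

# Placeholder accuracies only -- illustrative values, not a real evaluation.
submission = {"Model": "my-model-7b", "Overall": 0.4321}
submission.update({subject: 0.4321 for subject in SUBJECTS})

with open("submission.json", "w") as f:
    json.dump(submission, f, indent=2)
```

The keys "Model" and "Overall" plus the fourteen subject names are the only fields the new code reads; whether the upload widget hands `add_new_eval` the file's contents or a path is not shown in this diff, so the sketch only illustrates the expected keys.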