haoyang committed
Commit e7a7ef0
1 parent: df8d8ab

update leaderboard

Files changed (4):
  1. app.py +88 -1
  2. src/display/about.py +19 -18
  3. src/envs.py +1 -0
  4. src/leaderboard/read_evals.py +4 -0
app.py CHANGED
@@ -25,13 +25,21 @@ from src.display.utils import (
     WeightType,
     Precision
 )
-from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, TOKEN, REPO_ID, RESULTS_REPO
+from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, TOKEN, QUEUE_REPO, REPO_ID, RESULTS_REPO
 from src.populate import get_evaluation_queue_df, get_leaderboard_df
+from src.submission.submit import add_new_eval
 
 
 def restart_space():
     API.restart_space(repo_id=REPO_ID, token=TOKEN)
 
+try:
+    print(EVAL_REQUESTS_PATH)
+    snapshot_download(
+        repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
+    )
+except Exception:
+    restart_space()
 try:
     print(EVAL_RESULTS_PATH)
     snapshot_download(
@@ -44,6 +52,11 @@ except Exception:
 raw_data, original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
 leaderboard_df = original_df.copy()
 
+(
+    finished_eval_queue_df,
+    running_eval_queue_df,
+    pending_eval_queue_df,
+) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
 
 # Searching and filtering
 def update_table(
@@ -224,6 +237,80 @@ with demo:
 
         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=3):
             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
+
+        with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
+            with gr.Column():
+                with gr.Row():
+                    gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
+
+                with gr.Column():
+                    with gr.Accordion(
+                        f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
+                        open=False,
+                    ):
+                        with gr.Row():
+                            finished_eval_table = gr.components.Dataframe(
+                                value=finished_eval_queue_df,
+                                headers=EVAL_COLS,
+                                datatype=EVAL_TYPES,
+                                row_count=5,
+                            )
+                    with gr.Accordion(
+                        f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
+                        open=False,
+                    ):
+                        with gr.Row():
+                            running_eval_table = gr.components.Dataframe(
+                                value=running_eval_queue_df,
+                                headers=EVAL_COLS,
+                                datatype=EVAL_TYPES,
+                                row_count=5,
+                            )
+
+                    with gr.Accordion(
+                        f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
+                        open=False,
+                    ):
+                        with gr.Row():
+                            pending_eval_table = gr.components.Dataframe(
+                                value=pending_eval_queue_df,
+                                headers=EVAL_COLS,
+                                datatype=EVAL_TYPES,
+                                row_count=5,
+                            )
+            with gr.Row():
+                gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
+            with gr.Column():
+                precision = gr.Dropdown(
+                    choices=[i.value.name for i in Precision if i != Precision.Unknown],
+                    label="Precision",
+                    multiselect=False,
+                    value="float16",
+                    interactive=True,
+                )
+                weight_type = gr.Dropdown(
+                    choices=[i.value.name for i in WeightType],
+                    label="Weights type",
+                    multiselect=False,
+                    value="Original",
+                    interactive=True,
+                )
+                base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
+
+            submit_button = gr.Button("Submit Eval")
+            submission_result = gr.Markdown()
+            submit_button.click(
+                add_new_eval,
+                [
+                    model_name_textbox,
+                    base_model_name_textbox,
+                    revision_name_textbox,
+                    precision,
+                    weight_type,
+                    model_type,
+                ],
+                submission_result,
+            )
 
     with gr.Row():
         with gr.Accordion("📙 Citation", open=False):
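The new Submit tab unpacks three dataframes from get_evaluation_queue_df, which lives in src/populate.py and is not part of this diff. For context, here is a minimal sketch of the behaviour the tab relies on, assuming (as in the stock Hugging Face leaderboard template) that each submission under EVAL_REQUESTS_PATH is a JSON file carrying a "status" field; this is an illustration, not the repository's actual implementation:

```python
import glob
import json
import os

import pandas as pd


# Hypothetical sketch only -- src/populate.py is not shown in this commit. Assumes each
# request file is JSON with a "status" field, as in the stock leaderboard template.
def get_evaluation_queue_df(requests_path: str, cols: list[str]):
    rows = []
    for path in glob.glob(os.path.join(requests_path, "**", "*.json"), recursive=True):
        with open(path) as f:
            data = json.load(f)
        row = {col: data.get(col, "") for col in cols}
        row["status"] = data.get("status", "PENDING")
        rows.append(row)

    if not rows:
        empty = pd.DataFrame(columns=cols)
        return empty, empty, empty

    df = pd.DataFrame(rows)
    finished = df[df["status"] == "FINISHED"][cols]
    running = df[df["status"] == "RUNNING"][cols]
    pending = df[df["status"].isin(["PENDING", "RERUN"])][cols]
    return finished, running, pending
```

Note that model_name_textbox, revision_name_textbox, and model_type are passed to submit_button.click but are not created in the added hunk, so they are presumably defined elsewhere in app.py.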
src/display/about.py CHANGED
@@ -28,6 +28,25 @@ TITLE = """<h1 align="center" id="space-title">NPHardEval leaderboard</h1>"""
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
 NPHardEval serves as a comprehensive benchmark for assessing the reasoning abilities of large language models (LLMs) through the lens of computational complexity classes.
+"""
+
+# Which evaluations are you running? how can people reproduce what you have?
+LLM_BENCHMARKS_TEXT = f"""
+The paramount importance of complex reasoning in Large Language Models (LLMs) is well-recognized,
+especially in their application to intricate decision-making tasks. This underscores the necessity
+of thoroughly investigating LLMs' reasoning capabilities. To this end, various benchmarks have been
+developed to evaluate these capabilities. However, existing benchmarks fall short in providing a
+comprehensive assessment of LLMs' potential in reasoning. Additionally, there is a risk of overfitting,
+as these benchmarks are static and publicly accessible, allowing models to tailor responses to specific
+metrics, thus artificially boosting their performance.
+
+In response, our research introduces 'NPHardEval,' a novel benchmark meticulously designed to
+comprehensively evaluate LLMs' reasoning abilities. It comprises a diverse array of 900 algorithmic
+questions, spanning the spectrum up to NP-Hard complexity. These questions are strategically selected
+to cover a vast range of complexities, ensuring a thorough evaluation of LLMs' reasoning power. This
+benchmark not only offers insights into the current state of reasoning in LLMs but also establishes
+a benchmark for comparing LLMs' performance across various complexity classes.
+
 [Our repository](https://github.com/casmlab/NPHardEval) contains datasets, data generation scripts, and experimental procedures designed to evaluate LLMs in various reasoning tasks.
 In particular, we use three complexity classes to define the task complexity in the benchmark, including P (polynomial time), NP-complete (nondeterministic polynomial-time complete),
 and NP-hard, which are increasingly complex in both the intrinsic difficulty and the resources needed to solve them. The selected nine problems are:
@@ -50,24 +69,6 @@ Our benchmark offers several advantages compared with current benchmarks:
 - Automatic checking mechanisms
 - Automatic generation of datapoints
 - Complete focus on reasoning while exclude numerical computation
-"""
-
-# Which evaluations are you running? how can people reproduce what you have?
-LLM_BENCHMARKS_TEXT = f"""
-The paramount importance of complex reasoning in Large Language Models (LLMs) is well-recognized,
-especially in their application to intricate decision-making tasks. This underscores the necessity
-of thoroughly investigating LLMs' reasoning capabilities. To this end, various benchmarks have been
-developed to evaluate these capabilities. However, existing benchmarks fall short in providing a
-comprehensive assessment of LLMs' potential in reasoning. Additionally, there is a risk of overfitting,
-as these benchmarks are static and publicly accessible, allowing models to tailor responses to specific
-metrics, thus artificially boosting their performance.
-
-In response, our research introduces 'NPHardEval,' a novel benchmark meticulously designed to
-comprehensively evaluate LLMs' reasoning abilities. It comprises a diverse array of 900 algorithmic
-questions, spanning the spectrum up to NP-Hard complexity. These questions are strategically selected
-to cover a vast range of complexities, ensuring a thorough evaluation of LLMs' reasoning power. This
-benchmark not only offers insights into the current state of reasoning in LLMs but also establishes
-a benchmark for comparing LLMs' performance across various complexity classes.
 
 Our study marks a significant contribution to understanding LLMs' current reasoning capabilities
 and paves the way for future enhancements. Furthermore, NPHardEval features a dynamic update mechanism,
src/envs.py CHANGED
@@ -7,6 +7,7 @@ TOKEN = os.environ.get("TOKEN", None)
 
 OWNER = "hyfrankl"
 REPO_ID = f"{OWNER}/NPHardEval-leaderboard"
+QUEUE_REPO = f"{OWNER}/NPHardEval-requests"
 RESULTS_REPO = f"{OWNER}/NPHardEval-results"
 
 CACHE_PATH=os.getenv("HF_HOME", ".")
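QUEUE_REPO names the requests dataset that app.py now mirrors into EVAL_REQUESTS_PATH at startup. Creating that dataset is outside this commit; if it does not exist yet, a one-off bootstrap along these lines would work (hypothetical helper, assuming TOKEN has write access to the hyfrankl namespace):

```python
from huggingface_hub import HfApi

from src.envs import QUEUE_REPO, TOKEN


def ensure_queue_repo() -> None:
    """Create the requests dataset if it is missing; exist_ok makes this a no-op otherwise."""
    api = HfApi(token=TOKEN)
    api.create_repo(repo_id=QUEUE_REPO, repo_type="dataset", exist_ok=True)


if __name__ == "__main__":
    ensure_queue_repo()
```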
src/leaderboard/read_evals.py CHANGED
@@ -41,6 +41,9 @@ class EvalResult:
         # Precision
         precision = Precision.from_str(config.get("model_dtype"))
 
+        # Params
+        num_params = config.get("num_params", 0)
+
         # Get model and org
         org_and_model = config.get("model_name", config.get("model_args", None))
         org_and_model = org_and_model.split("/", 1)
@@ -89,6 +92,7 @@ class EvalResult:
             still_on_hub=still_on_hub,
             architecture=architecture,
             model_type=model_type,
+            num_params=num_params,
         )
 
     def update_with_request_file(self, requests_path):
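num_params is read from the config block of each results file with a default of 0, so older result files that lack the field still parse, and the value is then forwarded into the EvalResult being built. A small illustration of that fallback; everything other than the "model_name", "model_dtype", and "num_params" keys visible in the diff above is hypothetical, since the actual NPHardEval-results schema is not shown here:

```python
import json

# Hypothetical results payload; only "model_name", "model_dtype", and "num_params"
# come from the diff above, the rest of the schema is illustrative.
sample = json.loads("""
{
  "config": {
    "model_name": "example-org/example-model-7b",
    "model_dtype": "torch.float16",
    "num_params": 7000000000
  }
}
""")

config = sample["config"]
num_params = config.get("num_params", 0)   # field present -> 7000000000
legacy_params = {}.get("num_params", 0)    # older file without the field -> 0
print(num_params, legacy_params)
```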