haoyang committed
Commit • e7a7ef0
Parent(s): df8d8ab

update leaderboard

Files changed:
- app.py +88 -1
- src/display/about.py +19 -18
- src/envs.py +1 -0
- src/leaderboard/read_evals.py +4 -0
app.py CHANGED

@@ -25,13 +25,21 @@ from src.display.utils import (
     WeightType,
     Precision
 )
-from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, TOKEN, REPO_ID, RESULTS_REPO
+from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, TOKEN, QUEUE_REPO, REPO_ID, RESULTS_REPO
 from src.populate import get_evaluation_queue_df, get_leaderboard_df
+from src.submission.submit import add_new_eval
 
 
 def restart_space():
     API.restart_space(repo_id=REPO_ID, token=TOKEN)
 
+try:
+    print(EVAL_REQUESTS_PATH)
+    snapshot_download(
+        repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
+    )
+except Exception:
+    restart_space()
 try:
     print(EVAL_RESULTS_PATH)
     snapshot_download(

@@ -44,6 +52,11 @@ except Exception:
 raw_data, original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
 leaderboard_df = original_df.copy()
 
+(
+    finished_eval_queue_df,
+    running_eval_queue_df,
+    pending_eval_queue_df,
+) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
 
 # Searching and filtering
 def update_table(

@@ -224,6 +237,80 @@ with demo:
 
         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=3):
             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
+
+        with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
+            with gr.Column():
+                with gr.Row():
+                    gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
+
+                with gr.Column():
+                    with gr.Accordion(
+                        f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
+                        open=False,
+                    ):
+                        with gr.Row():
+                            finished_eval_table = gr.components.Dataframe(
+                                value=finished_eval_queue_df,
+                                headers=EVAL_COLS,
+                                datatype=EVAL_TYPES,
+                                row_count=5,
+                            )
+                    with gr.Accordion(
+                        f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
+                        open=False,
+                    ):
+                        with gr.Row():
+                            running_eval_table = gr.components.Dataframe(
+                                value=running_eval_queue_df,
+                                headers=EVAL_COLS,
+                                datatype=EVAL_TYPES,
+                                row_count=5,
+                            )
+
+                    with gr.Accordion(
+                        f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
+                        open=False,
+                    ):
+                        with gr.Row():
+                            pending_eval_table = gr.components.Dataframe(
+                                value=pending_eval_queue_df,
+                                headers=EVAL_COLS,
+                                datatype=EVAL_TYPES,
+                                row_count=5,
+                            )
+            with gr.Row():
+                gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
+            with gr.Column():
+                precision = gr.Dropdown(
+                    choices=[i.value.name for i in Precision if i != Precision.Unknown],
+                    label="Precision",
+                    multiselect=False,
+                    value="float16",
+                    interactive=True,
+                )
+                weight_type = gr.Dropdown(
+                    choices=[i.value.name for i in WeightType],
+                    label="Weights type",
+                    multiselect=False,
+                    value="Original",
+                    interactive=True,
+                )
+                base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
+
+            submit_button = gr.Button("Submit Eval")
+            submission_result = gr.Markdown()
+            submit_button.click(
+                add_new_eval,
+                [
+                    model_name_textbox,
+                    base_model_name_textbox,
+                    revision_name_textbox,
+                    precision,
+                    weight_type,
+                    model_type,
+                ],
+                submission_result,
+            )
 
     with gr.Row():
         with gr.Accordion("📙 Citation", open=False):
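For context, a minimal self-contained sketch of the "sync from the Hub, restart on failure" pattern the new queue download follows. Constants are inlined from src/envs.py; the TOKEN placeholder and the local path are illustrative, and the real Space reads them from its environment:

# Hedged sketch of the queue-sync pattern above; constants inlined from src/envs.py.
from huggingface_hub import HfApi, snapshot_download

API = HfApi()
REPO_ID = "hyfrankl/NPHardEval-leaderboard"
QUEUE_REPO = "hyfrankl/NPHardEval-requests"
EVAL_REQUESTS_PATH = "./eval-queue"  # illustrative local dir; the real one comes from src/envs.py
TOKEN = None  # placeholder; the Space reads its token from the environment

def restart_space():
    # On a failed sync, restarting the Space reruns startup and retries the download.
    API.restart_space(repo_id=REPO_ID, token=TOKEN)

try:
    snapshot_download(
        repo_id=QUEUE_REPO,
        local_dir=EVAL_REQUESTS_PATH,
        repo_type="dataset",  # the request queue is a dataset repo, not a model repo
        tqdm_class=None,      # silence per-file progress bars in the Space logs
        etag_timeout=30,      # tolerate slow Hub metadata responses before giving up
    )
except Exception:
    restart_space()

The submission tab wires its widgets to add_new_eval through Button.click. Below is a stripped-down sketch of that wiring with a stub in place of the real add_new_eval; model_name_textbox, revision_name_textbox, and model_type are defined elsewhere in app.py, so their labels and dropdown choices here are assumptions for illustration only:

import gradio as gr

def add_new_eval(model, base_model, revision, precision, weight_type, model_type):
    # Stub: the real function (src/submission/submit.py) validates the model
    # and writes a request file to the queue dataset repo.
    return f"Queued {model} (revision={revision or 'main'}, {precision}, {weight_type})"

with gr.Blocks() as demo:
    model_name_textbox = gr.Textbox(label="Model name")
    revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
    model_type = gr.Dropdown(choices=["pretrained", "fine-tuned"], label="Model type")
    precision = gr.Dropdown(choices=["float16", "bfloat16"], label="Precision", value="float16")
    weight_type = gr.Dropdown(choices=["Original", "Delta", "Adapter"], label="Weights type", value="Original")
    base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")

    submit_button = gr.Button("Submit Eval")
    submission_result = gr.Markdown()
    # click(fn, inputs, outputs): Gradio passes the six widget values positionally,
    # in the same order as in the diff above.
    submit_button.click(
        add_new_eval,
        [model_name_textbox, base_model_name_textbox, revision_name_textbox,
         precision, weight_type, model_type],
        submission_result,
    )

demo.launch()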
src/display/about.py CHANGED

@@ -28,6 +28,25 @@ TITLE = """<h1 align="center" id="space-title">NPHardEval leaderboard</h1>"""
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
 NPHardEval serves as a comprehensive benchmark for assessing the reasoning abilities of large language models (LLMs) through the lens of computational complexity classes.
+"""
+
+# Which evaluations are you running? how can people reproduce what you have?
+LLM_BENCHMARKS_TEXT = f"""
+The paramount importance of complex reasoning in Large Language Models (LLMs) is well-recognized,
+especially in their application to intricate decision-making tasks. This underscores the necessity
+of thoroughly investigating LLMs' reasoning capabilities. To this end, various benchmarks have been
+developed to evaluate these capabilities. However, existing benchmarks fall short in providing a
+comprehensive assessment of LLMs' potential in reasoning. Additionally, there is a risk of overfitting,
+as these benchmarks are static and publicly accessible, allowing models to tailor responses to specific
+metrics, thus artificially boosting their performance.
+
+In response, our research introduces 'NPHardEval,' a novel benchmark meticulously designed to
+comprehensively evaluate LLMs' reasoning abilities. It comprises a diverse array of 900 algorithmic
+questions, spanning the spectrum up to NP-Hard complexity. These questions are strategically selected
+to cover a vast range of complexities, ensuring a thorough evaluation of LLMs' reasoning power. This
+benchmark not only offers insights into the current state of reasoning in LLMs but also establishes
+a benchmark for comparing LLMs' performance across various complexity classes.
+
 [Our repository](https://github.com/casmlab/NPHardEval) contains datasets, data generation scripts, and experimental procedures designed to evaluate LLMs in various reasoning tasks.
 In particular, we use three complexity classes to define the task complexity in the benchmark, including P (polynomial time), NP-complete (nondeterministic polynomial-time complete),
 and NP-hard, which are increasingly complex in both the intrinsic difficulty and the resources needed to solve them. The selected nine problems are:

@@ -50,24 +69,6 @@ Our benchmark offers several advantages compared with current benchmarks:
 - Automatic checking mechanisms
 - Automatic generation of datapoints
 - Complete focus on reasoning while exclude numerical computation
-"""
-
-# Which evaluations are you running? how can people reproduce what you have?
-LLM_BENCHMARKS_TEXT = f"""
-The paramount importance of complex reasoning in Large Language Models (LLMs) is well-recognized,
-especially in their application to intricate decision-making tasks. This underscores the necessity
-of thoroughly investigating LLMs' reasoning capabilities. To this end, various benchmarks have been
-developed to evaluate these capabilities. However, existing benchmarks fall short in providing a
-comprehensive assessment of LLMs' potential in reasoning. Additionally, there is a risk of overfitting,
-as these benchmarks are static and publicly accessible, allowing models to tailor responses to specific
-metrics, thus artificially boosting their performance.
-
-In response, our research introduces 'NPHardEval,' a novel benchmark meticulously designed to
-comprehensively evaluate LLMs' reasoning abilities. It comprises a diverse array of 900 algorithmic
-questions, spanning the spectrum up to NP-Hard complexity. These questions are strategically selected
-to cover a vast range of complexities, ensuring a thorough evaluation of LLMs' reasoning power. This
-benchmark not only offers insights into the current state of reasoning in LLMs but also establishes
-a benchmark for comparing LLMs' performance across various complexity classes.
 
 Our study marks a significant contribution to understanding LLMs' current reasoning capabilities
 and paves the way for future enhancements. Furthermore, NPHardEval features a dynamic update mechanism,
src/envs.py CHANGED

@@ -7,6 +7,7 @@ TOKEN = os.environ.get("TOKEN", None)
 
 OWNER = "hyfrankl"
 REPO_ID = f"{OWNER}/NPHardEval-leaderboard"
+QUEUE_REPO = f"{OWNER}/NPHardEval-requests"
 RESULTS_REPO = f"{OWNER}/NPHardEval-results"
 
 CACHE_PATH=os.getenv("HF_HOME", ".")
src/leaderboard/read_evals.py CHANGED

@@ -41,6 +41,9 @@ class EvalResult:
         # Precision
         precision = Precision.from_str(config.get("model_dtype"))
 
+        # Params
+        num_params = config.get("num_params", 0)
+
         # Get model and org
         org_and_model = config.get("model_name", config.get("model_args", None))
         org_and_model = org_and_model.split("/", 1)

@@ -89,6 +92,7 @@ class EvalResult:
             still_on_hub=still_on_hub,
             architecture=architecture,
             model_type=model_type,
+            num_params=num_params,
         )
 
     def update_with_request_file(self, requests_path):
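A hedged sketch of how the new field flows through: num_params is read from the "config" block of a results JSON, defaulting to 0 when absent, and forwarded into the EvalResult constructor. The payload below is a hypothetical example, not the exact schema of the results repo:

import json

# Hypothetical results-file payload; only the "config" block matters for this diff.
sample = json.loads("""
{
  "config": {
    "model_dtype": "torch.float16",
    "model_name": "my-org/my-model",
    "num_params": 7000000000
  },
  "results": {}
}
""")

config = sample["config"]
num_params = config.get("num_params", 0)  # same default-to-0 fallback as the diff
print(num_params)  # 7000000000; prints 0 if the key is missing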