feedback #1 applied
- app.py +14 -10
- src/about.py +10 -7
- src/leaderboard/read_evals.py +3 -1
- src/submission/submit.py +4 -4
app.py CHANGED

```diff
@@ -174,10 +174,10 @@ def get_model_info_blocks(chosen_model_name):
     filtered_df = LEADERBOARD_DF[LEADERBOARD_DF["Model Name"]==model_name_full].reset_index(drop=True)
     skills_bar_df = pd.DataFrame({
         'Skills': skills,
-        '
+        'Benchmark Score': filtered_df[skills].values[0]
     })
 
-    skills_bar_df = skills_bar_df.sort_values(by=['
+    skills_bar_df = skills_bar_df.sort_values(by=['Benchmark Score'], ascending=False).reset_index(drop=True)
 
     def get_metric_html(metric_title):
         return f"<div class='deep-dive-metric'><b>{metric_title}</b><span class='ddm-value'>{{}}</div>"
@@ -187,7 +187,7 @@ def get_model_info_blocks(chosen_model_name):
     with gr.Row():
         model_name = gr.HTML(get_metric_html("Model Name").format(chosen_model_name))
     with gr.Row():
-        benchmark_score = gr.HTML(get_metric_html("Benchmark Score").format(filtered_df["Benchmark Score"][0]))
+        benchmark_score = gr.HTML(get_metric_html("Benchmark Score").format(str(filtered_df["Benchmark Score"][0])+"/10"))
         rank = gr.HTML(get_metric_html("Benchmark Rank").format(filtered_df["Rank"][0]))
         speed = gr.HTML(get_metric_html("Speed <br/>(words per second)").format(filtered_df["Speed (words/sec)"][0]))
         contamination = gr.HTML(get_metric_html("Contamination Score").format(filtered_df["Contamination Score"][0]))
@@ -197,29 +197,33 @@ def get_model_info_blocks(chosen_model_name):
     skills_bar = gr.BarPlot(
         value=skills_bar_df,
         x="Skills",
-        y="
+        y="Benchmark Score",
         width=500,
         height=500,
         x_label_angle=45,
         color="Skills",
         color_title=None,
-        label="
+        label=f"{chosen_model_name} model skills",
         sort="-y"
     )
 
 
-    html_file_content = get_model_answers_html_file(EVAL_RESULTS_PATH, chosen_model_name)
-
+    html_file_content,download_file_path = get_model_answers_html_file(EVAL_RESULTS_PATH, chosen_model_name)
+    print(download_file_path)
     if html_file_content == "EMPTY":
         answers_html = gr.Markdown("")
     else:
+        with gr.Row():
+            gr.Markdown(f"""
+            <a href='{download_file_path}' target='_blank'>Download model answers here</a>
+            """)
         with gr.Row():
 
             ##strip style and script tags from html
             html_file_content = re.sub('<style.*?>.*?</style>', '', html_file_content, flags=re.DOTALL)
             html_file_content = re.sub('<script.*?>.*?</script>', '', html_file_content, flags=re.DOTALL)
            html_file_content = html_file_content.replace('<html lang="ar" dir="rtl">','<html>')
-
+
             answers_html = gr.HTML(html_file_content,max_height=500,show_label=True,
                 label="Model Responses", container=True, elem_classes="model_responses_container")
 
@@ -234,7 +238,7 @@ def init_compare_tab(dataframe):
     model_names = dataframe["Model Name"].unique().tolist()
     model_names_clean = [strip_html_tags(model_name) for model_name in model_names]
     with gr.Row():
-        models_dropdown = gr.Dropdown(choices=model_names_clean, label="Select
+        models_dropdown = gr.Dropdown(choices=model_names_clean, label="Select Models",
                                       value=model_names_clean[0], multiselect=True)
 
 
@@ -323,7 +327,7 @@ with demo:
     prereqs_checkboxes = gr.CheckboxGroup(["I have successfully run the ABB benchmark script on my model using my own infrastructure and I am NOT using the Leaderboard for testing purposes",
         "I understand that my account/org have only one submission per month",
         "I understand that I can't submit models more than 15B parameters (learn more in the FAQ)",
-        "I understand that submitting contaminated models or models to test the contamination score will lead to action from our side including banning
+        "I understand that submitting contaminated models or models to test the contamination score will lead to action from our side including banning. We also reserve the right to delete any model we think is contaminated without notice."],
         label=None, info=None,
         elem_classes="submit_prereq_checkboxes_container",
         container=False)
```
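Note (illustration, not part of the commit): the hunks above wire the skills bar plot to a new "Benchmark Score" column and sort it in descending order. Below is a minimal, self-contained sketch of that construction, using placeholder skill names and scores in place of LEADERBOARD_DF and only the gr.BarPlot parameters that appear in the diff.

```python
# Illustrative sketch only: placeholder data stands in for LEADERBOARD_DF.
import gradio as gr
import pandas as pd

skills = ["Skill A", "Skill B", "Skill C"]   # hypothetical skill column names
scores = [7.4, 8.1, 6.2]                     # hypothetical per-skill scores

skills_bar_df = pd.DataFrame({"Skills": skills, "Benchmark Score": scores})
skills_bar_df = skills_bar_df.sort_values(
    by=["Benchmark Score"], ascending=False
).reset_index(drop=True)

with gr.Blocks() as demo:
    gr.BarPlot(
        value=skills_bar_df,
        x="Skills",
        y="Benchmark Score",   # y now reads the renamed score column
        color="Skills",
        sort="-y",             # tallest bar first, matching the descending sort
        label="Example model skills",
        width=500,
        height=500,
    )

# demo.launch()  # uncomment to render the plot locally
```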
src/about.py CHANGED

```diff
@@ -67,28 +67,31 @@ Find more details in the about Tab.
 
 # Which evaluations are you running? how can people reproduce what you have?
 LLM_BENCHMARKS_TEXT = f"""
-
+
+## FAQ
+
+### What is the difference betweem ABL and ABB?
 
 ABL is the Leaderboard which uses ABB benchmarking dataset and code in the backend to produce the results you see here
 
 
-
+### What can I learn more about ABL and ABB?
 
 Feel free to read the following resources
 ABB Page:
 ABL blog post:
 
-
+### How can I reproduce the results?
 
 You can easily run the ABB benchmarking code using the following command on Google Collab or your own infratructure.
 
-
+### What is the Benchmark Score?
 
-
+### What is the Contamination Score?
 
-
+### What is the Speed?
 
-
+### Why I am not allowed to submit models more than 15B parameters?
 
 
 """
```
src/leaderboard/read_evals.py CHANGED

```diff
@@ -240,6 +240,7 @@ def get_model_answers_html_file(results_path, model_name):
     model_answers_prefix = f"{results_path}/{model_org}/"
 
     html_file_content = "EMPTY"
+    download_file_path = "https://huggingface.co/spaces/silma-ai/Arabic-LLM-Broad-Leaderboard/raw/main/"
 
     for root, _, files in os.walk(model_answers_prefix):
 
@@ -252,6 +253,7 @@ def get_model_answers_html_file(results_path, model_name):
             with open(file_path, "r") as f:
 
                 html_file_content = f.read()
+                download_file_path = download_file_path + file_path
             break
 
-    return html_file_content
+    return html_file_content,download_file_path
```
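Note (illustration, not the file's full implementation): the change above makes the helper return a pair instead of a single string. Below is a trimmed sketch of the new contract; the simplified signature, the `.html` filter, and the directory layout are assumptions, while the raw-file URL prefix and the `(content, link)` return shape come from the hunks.

```python
# Simplified sketch of the updated return contract (not the repo's exact code).
import os

RAW_PREFIX = "https://huggingface.co/spaces/silma-ai/Arabic-LLM-Broad-Leaderboard/raw/main/"

def get_model_answers_html_file(results_path: str, model_org: str) -> tuple[str, str]:
    html_file_content = "EMPTY"
    download_file_path = RAW_PREFIX                  # fallback when nothing is found
    for root, _, files in os.walk(f"{results_path}/{model_org}/"):
        for file in files:
            if file.endswith(".html"):               # assumption: answers are stored as HTML
                file_path = os.path.join(root, file)
                with open(file_path, "r") as f:
                    html_file_content = f.read()
                # the download link is the Space's raw-file URL plus the walked path
                download_file_path = download_file_path + file_path
                break
    return html_file_content, download_file_path

# Callers now unpack both values, as app.py does above:
# html_file_content, download_file_path = get_model_answers_html_file(EVAL_RESULTS_PATH, chosen_model_name)
```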
src/submission/submit.py CHANGED

```diff
@@ -71,8 +71,8 @@ def add_new_eval(
 
     model_size = get_model_size(model_info=model_info)#, precision=precision
 
-    if model_size>
-        return styled_error("
+    if model_size>15:
+        return styled_error("Unfortunately we do not accept models above 15B parameters from the community due to limited GPU availability.")
 
     # Were the model card and license filled?
     try:
@@ -137,7 +137,7 @@ def add_new_eval(
 
     if queue_len == 0:
         queue_data = []
-    elif queue_len >=
+    elif queue_len >= 1:
         return styled_warning("The evaluation queue is full at the moment. Please try again in one hour")
 
     queue_data.append(eval_entry)
@@ -172,5 +172,5 @@ def add_new_eval(
 
 
     return styled_message(
-        "
+        "✅ Good news! Your model has been added to the evaluation queue.<br>If you do not see the results after 3 hours then please let us know by opening a community discussion."
     )
```
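Note (illustration, not part of the commit): the hunks above add two gates before a submission is accepted, plus a friendlier success message. A compact sketch of that control flow follows, with trivial stand-ins for the repo's styled_* helpers; the thresholds (15B parameters, a single queue slot) are taken from the diff.

```python
# Illustrative sketch: the submission gates added above, with stub helpers.
def styled_error(msg: str) -> str:   return f"ERROR: {msg}"
def styled_warning(msg: str) -> str: return f"WARNING: {msg}"
def styled_message(msg: str) -> str: return f"OK: {msg}"

def check_submission(model_size_b: float, queue_len: int) -> str:
    # Gate 1: community models above 15B parameters are rejected outright.
    if model_size_b > 15:
        return styled_error("we do not accept models above 15B parameters")
    # Gate 2: a single-slot queue; any pending entry means "try again later".
    if queue_len >= 1:
        return styled_warning("the evaluation queue is full, please try again in one hour")
    return styled_message("your model has been added to the evaluation queue")

# Example: check_submission(model_size_b=7, queue_len=0) returns the success message.
```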
|