feedback #1 applied
- app.py +14 -10
- src/about.py +10 -7
- src/leaderboard/read_evals.py +3 -1
- src/submission/submit.py +4 -4
app.py CHANGED

```diff
@@ -174,10 +174,10 @@ def get_model_info_blocks(chosen_model_name):
     filtered_df = LEADERBOARD_DF[LEADERBOARD_DF["Model Name"]==model_name_full].reset_index(drop=True)
     skills_bar_df = pd.DataFrame({
         'Skills': skills,
-        '
+        'Benchmark Score': filtered_df[skills].values[0]
     })
 
-    skills_bar_df = skills_bar_df.sort_values(by=['
+    skills_bar_df = skills_bar_df.sort_values(by=['Benchmark Score'], ascending=False).reset_index(drop=True)
 
     def get_metric_html(metric_title):
         return f"<div class='deep-dive-metric'><b>{metric_title}</b><span class='ddm-value'>{{}}</div>"
@@ -187,7 +187,7 @@ def get_model_info_blocks(chosen_model_name):
     with gr.Row():
         model_name = gr.HTML(get_metric_html("Model Name").format(chosen_model_name))
     with gr.Row():
-        benchmark_score = gr.HTML(get_metric_html("Benchmark Score").format(filtered_df["Benchmark Score"][0]))
+        benchmark_score = gr.HTML(get_metric_html("Benchmark Score").format(str(filtered_df["Benchmark Score"][0])+"/10"))
         rank = gr.HTML(get_metric_html("Benchmark Rank").format(filtered_df["Rank"][0]))
         speed = gr.HTML(get_metric_html("Speed <br/>(words per second)").format(filtered_df["Speed (words/sec)"][0]))
         contamination = gr.HTML(get_metric_html("Contamination Score").format(filtered_df["Contamination Score"][0]))
@@ -197,29 +197,33 @@ def get_model_info_blocks(chosen_model_name):
     skills_bar = gr.BarPlot(
         value=skills_bar_df,
         x="Skills",
-        y="
+        y="Benchmark Score",
         width=500,
         height=500,
         x_label_angle=45,
         color="Skills",
         color_title=None,
-        label="
+        label=f"{chosen_model_name} model skills",
         sort="-y"
     )
 
 
-    html_file_content = get_model_answers_html_file(EVAL_RESULTS_PATH, chosen_model_name)
-
+    html_file_content,download_file_path = get_model_answers_html_file(EVAL_RESULTS_PATH, chosen_model_name)
+    print(download_file_path)
     if html_file_content == "EMPTY":
         answers_html = gr.Markdown("")
     else:
+        with gr.Row():
+            gr.Markdown(f"""
+            <a href='{download_file_path}' target='_blank'>Download model answers here</a>
+            """)
         with gr.Row():
 
             ##strip style and script tags from html
             html_file_content = re.sub('<style.*?>.*?</style>', '', html_file_content, flags=re.DOTALL)
             html_file_content = re.sub('<script.*?>.*?</script>', '', html_file_content, flags=re.DOTALL)
            html_file_content = html_file_content.replace('<html lang="ar" dir="rtl">','<html>')
-
+
             answers_html = gr.HTML(html_file_content,max_height=500,show_label=True,
                 label="Model Responses", container=True, elem_classes="model_responses_container")
 
@@ -234,7 +238,7 @@ def init_compare_tab(dataframe):
     model_names = dataframe["Model Name"].unique().tolist()
     model_names_clean = [strip_html_tags(model_name) for model_name in model_names]
     with gr.Row():
-        models_dropdown = gr.Dropdown(choices=model_names_clean, label="Select
+        models_dropdown = gr.Dropdown(choices=model_names_clean, label="Select Models",
                                       value=model_names_clean[0], multiselect=True)
 
 
@@ -323,7 +327,7 @@ with demo:
     prereqs_checkboxes = gr.CheckboxGroup(["I have successfully run the ABB benchmark script on my model using my own infrastructure and I am NOT using the Leaderboard for testing purposes",
         "I understand that my account/org have only one submission per month",
         "I understand that I can't submit models more than 15B parameters (learn more in the FAQ)",
-        "I understand that submitting contaminated models or models to test the contamination score will lead to action from our side including banning
+        "I understand that submitting contaminated models or models to test the contamination score will lead to action from our side including banning. We also reserve the right to delete any model we think is contaminated without notice."],
         label=None, info=None,
         elem_classes="submit_prereq_checkboxes_container",
         container=False)
```
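Note (illustration, not part of the commit): the hunks above wire the skills bar plot to a new "Benchmark Score" column and sort it in descending order. Below is a minimal, self-contained sketch of that construction, using placeholder skill names and scores in place of LEADERBOARD_DF and only the gr.BarPlot parameters that appear in the diff.

```python
# Illustrative sketch only: placeholder data stands in for LEADERBOARD_DF.
import gradio as gr
import pandas as pd

skills = ["Skill A", "Skill B", "Skill C"]   # hypothetical skill column names
scores = [7.4, 8.1, 6.2]                     # hypothetical per-skill scores

skills_bar_df = pd.DataFrame({"Skills": skills, "Benchmark Score": scores})
skills_bar_df = skills_bar_df.sort_values(
    by=["Benchmark Score"], ascending=False
).reset_index(drop=True)

with gr.Blocks() as demo:
    gr.BarPlot(
        value=skills_bar_df,
        x="Skills",
        y="Benchmark Score",   # y now reads the renamed score column
        color="Skills",
        sort="-y",             # tallest bar first, matching the descending sort
        label="Example model skills",
        width=500,
        height=500,
    )

# demo.launch()  # uncomment to render the plot locally
```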
src/about.py CHANGED

```diff
@@ -67,28 +67,31 @@ Find more details in the about Tab.
 
 # Which evaluations are you running? how can people reproduce what you have?
 LLM_BENCHMARKS_TEXT = f"""
-
+
+## FAQ
+
+### What is the difference betweem ABL and ABB?
 
 ABL is the Leaderboard which uses ABB benchmarking dataset and code in the backend to produce the results you see here
 
 
-
+### What can I learn more about ABL and ABB?
 
 Feel free to read the following resources
 ABB Page:
 ABL blog post:
 
-
+### How can I reproduce the results?
 
 You can easily run the ABB benchmarking code using the following command on Google Collab or your own infratructure.
 
-
+### What is the Benchmark Score?
 
-
+### What is the Contamination Score?
 
-
+### What is the Speed?
 
-
+### Why I am not allowed to submit models more than 15B parameters?
 
 
 """
```
src/leaderboard/read_evals.py CHANGED

```diff
@@ -240,6 +240,7 @@ def get_model_answers_html_file(results_path, model_name):
     model_answers_prefix = f"{results_path}/{model_org}/"
 
     html_file_content = "EMPTY"
+    download_file_path = "https://huggingface.co/spaces/silma-ai/Arabic-LLM-Broad-Leaderboard/raw/main/"
 
     for root, _, files in os.walk(model_answers_prefix):
 
@@ -252,6 +253,7 @@ def get_model_answers_html_file(results_path, model_name):
             with open(file_path, "r") as f:
 
                 html_file_content = f.read()
+                download_file_path = download_file_path + file_path
             break
 
-    return html_file_content
+    return html_file_content,download_file_path
```
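Note (illustration, not the file's full implementation): the change above makes the helper return a pair instead of a single string. Below is a trimmed sketch of the new contract; the simplified signature, the `.html` filter, and the directory layout are assumptions, while the raw-file URL prefix and the `(content, link)` return shape come from the hunks.

```python
# Simplified sketch of the updated return contract (not the repo's exact code).
import os

RAW_PREFIX = "https://huggingface.co/spaces/silma-ai/Arabic-LLM-Broad-Leaderboard/raw/main/"

def get_model_answers_html_file(results_path: str, model_org: str) -> tuple[str, str]:
    html_file_content = "EMPTY"
    download_file_path = RAW_PREFIX                  # fallback when nothing is found
    for root, _, files in os.walk(f"{results_path}/{model_org}/"):
        for file in files:
            if file.endswith(".html"):               # assumption: answers are stored as HTML
                file_path = os.path.join(root, file)
                with open(file_path, "r") as f:
                    html_file_content = f.read()
                # the download link is the Space's raw-file URL plus the walked path
                download_file_path = download_file_path + file_path
                break
    return html_file_content, download_file_path

# Callers now unpack both values, as app.py does above:
# html_file_content, download_file_path = get_model_answers_html_file(EVAL_RESULTS_PATH, chosen_model_name)
```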
src/submission/submit.py CHANGED

```diff
@@ -71,8 +71,8 @@ def add_new_eval(
 
     model_size = get_model_size(model_info=model_info)#, precision=precision
 
-    if model_size>
-        return styled_error("
+    if model_size>15:
+        return styled_error("Unfortunately we do not accept models above 15B parameters from the community due to limited GPU availability.")
 
     # Were the model card and license filled?
     try:
@@ -137,7 +137,7 @@ def add_new_eval(
 
     if queue_len == 0:
         queue_data = []
-    elif queue_len >=
+    elif queue_len >= 1:
         return styled_warning("The evaluation queue is full at the moment. Please try again in one hour")
 
     queue_data.append(eval_entry)
@@ -172,5 +172,5 @@ def add_new_eval(
 
 
     return styled_message(
-        "
+        "✅ Good news! Your model has been added to the evaluation queue.<br>If you do not see the results after 3 hours then please let us know by opening a community discussion."
     )
```
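Note (illustration, not part of the commit): the hunks above add two gates before a submission is accepted, plus a friendlier success message. A compact sketch of that control flow follows, with trivial stand-ins for the repo's styled_* helpers; the thresholds (15B parameters, a single queue slot) are taken from the diff.

```python
# Illustrative sketch: the submission gates added above, with stub helpers.
def styled_error(msg: str) -> str:   return f"ERROR: {msg}"
def styled_warning(msg: str) -> str: return f"WARNING: {msg}"
def styled_message(msg: str) -> str: return f"OK: {msg}"

def check_submission(model_size_b: float, queue_len: int) -> str:
    # Gate 1: community models above 15B parameters are rejected outright.
    if model_size_b > 15:
        return styled_error("we do not accept models above 15B parameters")
    # Gate 2: a single-slot queue; any pending entry means "try again later".
    if queue_len >= 1:
        return styled_warning("the evaluation queue is full, please try again in one hour")
    return styled_message("your model has been added to the evaluation queue")

# Example: check_submission(model_size_b=7, queue_len=0) returns the success message.
```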
|