Commit 2fe1d39 (parent: 9af0c0a), committed by karimouda

feedback #1 applied

app.py CHANGED
@@ -174,10 +174,10 @@ def get_model_info_blocks(chosen_model_name):
     filtered_df = LEADERBOARD_DF[LEADERBOARD_DF["Model Name"]==model_name_full].reset_index(drop=True)
     skills_bar_df = pd.DataFrame({
         'Skills': skills,
-        'Scores': filtered_df[skills].values[0]
+        'Benchmark Score': filtered_df[skills].values[0]
     })

-    skills_bar_df = skills_bar_df.sort_values(by=['Scores'], ascending=False).reset_index(drop=True)
+    skills_bar_df = skills_bar_df.sort_values(by=['Benchmark Score'], ascending=False).reset_index(drop=True)

     def get_metric_html(metric_title):
         return f"<div class='deep-dive-metric'><b>{metric_title}</b><span class='ddm-value'>{{}}</div>"
@@ -187,7 +187,7 @@ def get_model_info_blocks(chosen_model_name):
     with gr.Row():
         model_name = gr.HTML(get_metric_html("Model Name").format(chosen_model_name))
     with gr.Row():
-        benchmark_score = gr.HTML(get_metric_html("Benchmark Score").format(filtered_df["Benchmark Score"][0]))
+        benchmark_score = gr.HTML(get_metric_html("Benchmark Score").format(str(filtered_df["Benchmark Score"][0])+"/10"))
         rank = gr.HTML(get_metric_html("Benchmark Rank").format(filtered_df["Rank"][0]))
         speed = gr.HTML(get_metric_html("Speed <br/>(words per second)").format(filtered_df["Speed (words/sec)"][0]))
         contamination = gr.HTML(get_metric_html("Contamination Score").format(filtered_df["Contamination Score"][0]))
@@ -197,29 +197,33 @@ def get_model_info_blocks(chosen_model_name):
     skills_bar = gr.BarPlot(
         value=skills_bar_df,
         x="Skills",
-        y="Scores",
+        y="Benchmark Score",
         width=500,
         height=500,
         x_label_angle=45,
         color="Skills",
         color_title=None,
-        label="Model Skills",
+        label=f"{chosen_model_name} model skills",
         sort="-y"
     )


-    html_file_content = get_model_answers_html_file(EVAL_RESULTS_PATH, chosen_model_name)
-
+    html_file_content,download_file_path = get_model_answers_html_file(EVAL_RESULTS_PATH, chosen_model_name)
+    print(download_file_path)
     if html_file_content == "EMPTY":
         answers_html = gr.Markdown("")
     else:
+        with gr.Row():
+            gr.Markdown(f"""
+            <a href='{download_file_path}' target='_blank'>Download model answers here</a>
+            """)
         with gr.Row():

             ##strip style and script tags from html
             html_file_content = re.sub('<style.*?>.*?</style>', '', html_file_content, flags=re.DOTALL)
             html_file_content = re.sub('<script.*?>.*?</script>', '', html_file_content, flags=re.DOTALL)
             html_file_content = html_file_content.replace('<html lang="ar" dir="rtl">','<html>')
-
+
             answers_html = gr.HTML(html_file_content,max_height=500,show_label=True,
                 label="Model Responses", container=True, elem_classes="model_responses_container")

@@ -234,7 +238,7 @@ def init_compare_tab(dataframe):
     model_names = dataframe["Model Name"].unique().tolist()
     model_names_clean = [strip_html_tags(model_name) for model_name in model_names]
     with gr.Row():
-        models_dropdown = gr.Dropdown(choices=model_names_clean, label="Select Model",
+        models_dropdown = gr.Dropdown(choices=model_names_clean, label="Select Models",
             value=model_names_clean[0], multiselect=True)


@@ -323,7 +327,7 @@ with demo:
     prereqs_checkboxes = gr.CheckboxGroup(["I have successfully run the ABB benchmark script on my model using my own infrastructure and I am NOT using the Leaderboard for testing purposes",
         "I understand that my account/org have only one submission per month",
         "I understand that I can't submit models more than 15B parameters (learn more in the FAQ)",
-        "I understand that submitting contaminated models or models to test the contamination score will lead to action from our side including banning and public sharing of the incident"],
+        "I understand that submitting contaminated models or models to test the contamination score will lead to action from our side including banning. We also reserve the right to delete any model we think is contaminated without notice."],
         label=None, info=None,
         elem_classes="submit_prereq_checkboxes_container",
         container=False)
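As context for the column rename above: gr.BarPlot takes its axis titles from the DataFrame column names passed as x and y, so calling the value column "Benchmark Score" is what puts that label on the chart. Below is a minimal, self-contained sketch of that data flow, not the app's actual module; the skill names and scores are invented, and it assumes a Gradio version whose BarPlot accepts the same x_label_angle and sort arguments used in the diff.

```python
import gradio as gr
import pandas as pd

# Placeholder skills and scores standing in for the leaderboard's per-skill columns.
skills = ["MMLU", "RAG", "Dialects", "Reasoning"]
scores = [7.1, 6.4, 8.0, 5.9]

skills_bar_df = pd.DataFrame({
    "Skills": skills,
    "Benchmark Score": scores,
}).sort_values(by=["Benchmark Score"], ascending=False).reset_index(drop=True)

with gr.Blocks() as demo:
    gr.BarPlot(
        value=skills_bar_df,
        x="Skills",
        y="Benchmark Score",   # must match the DataFrame column name
        color="Skills",
        x_label_angle=45,
        sort="-y",             # tallest bar first
        label="example-model skills",
    )

if __name__ == "__main__":
    demo.launch()
```

Pre-sorting the DataFrame and passing sort="-y" overlap; either one alone should already give a descending bar order.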
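The embedded answers page is sanitized with non-greedy regexes plus re.DOTALL, so multi-line <style> and <script> blocks are removed whole, and the RTL <html> attributes are dropped so the fragment inherits the surrounding page's direction. The following is a small standalone illustration of that technique only; the raw_html value is invented and this is a sketch of the approach, not the leaderboard's code.

```python
import re

# Hypothetical input standing in for a model-answers HTML file.
raw_html = """<html lang="ar" dir="rtl">
<style>
body { color: red; }
</style>
<script>alert("hi");</script>
<body>...answers...</body></html>"""

# Non-greedy matches plus re.DOTALL remove multi-line style/script blocks in one pass.
cleaned = re.sub('<style.*?>.*?</style>', '', raw_html, flags=re.DOTALL)
cleaned = re.sub('<script.*?>.*?</script>', '', cleaned, flags=re.DOTALL)

# Drop the RTL attributes so the embedded fragment follows the host page's direction.
cleaned = cleaned.replace('<html lang="ar" dir="rtl">', '<html>')
print(cleaned)
```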
src/about.py CHANGED
@@ -67,28 +67,31 @@ Find more details in the about Tab.

 # Which evaluations are you running? how can people reproduce what you have?
 LLM_BENCHMARKS_TEXT = f"""
-## What is the difference betweem ABL and ABB?
+
+## FAQ
+
+### What is the difference betweem ABL and ABB?

 ABL is the Leaderboard which uses ABB benchmarking dataset and code in the backend to produce the results you see here


-## What can I learn more about ABL and ABB?
+### What can I learn more about ABL and ABB?

 Feel free to read the following resources
 ABB Page:
 ABL blog post:

-## How can I reproduce the results?
+### How can I reproduce the results?

 You can easily run the ABB benchmarking code using the following command on Google Collab or your own infratructure.

-## What is the Benchmark Score?
+### What is the Benchmark Score?

-## What is the Contamination Score?
+### What is the Contamination Score?

-## What is the Speed?
+### What is the Speed?

-## Why I am not allowed to submit models more than 15B parameters?
+### Why I am not allowed to submit models more than 15B parameters?


 """
src/leaderboard/read_evals.py CHANGED
@@ -240,6 +240,7 @@ def get_model_answers_html_file(results_path, model_name):
     model_answers_prefix = f"{results_path}/{model_org}/"

     html_file_content = "EMPTY"
+    download_file_path = "https://huggingface.co/spaces/silma-ai/Arabic-LLM-Broad-Leaderboard/raw/main/"

     for root, _, files in os.walk(model_answers_prefix):

@@ -252,6 +253,7 @@ def get_model_answers_html_file(results_path, model_name):
             with open(file_path, "r") as f:

                 html_file_content = f.read()
+                download_file_path = download_file_path + file_path
             break

-    return html_file_content
+    return html_file_content,download_file_path
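Since get_model_answers_html_file now returns a pair, callers unpack both the HTML content and the download URL and still check for the "EMPTY" sentinel. A hedged sketch of that calling pattern follows; the import path assumes the repo's src package is importable as shown, and the results path and model name are placeholders.

```python
# Illustrative caller-side handling of the (content, url) pair; not code from the repo.
from src.leaderboard.read_evals import get_model_answers_html_file

EVAL_RESULTS_PATH = "./eval-results"  # placeholder results directory
html_file_content, download_file_path = get_model_answers_html_file(
    EVAL_RESULTS_PATH, "org/some-model"  # hypothetical model name
)

if html_file_content == "EMPTY":
    # No answers file was found; the sentinel string (not an exception) signals this.
    print("No model answers available")
else:
    print(f"Answers can be downloaded from: {download_file_path}")
```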
src/submission/submit.py CHANGED
@@ -71,8 +71,8 @@ def add_new_eval(

     model_size = get_model_size(model_info=model_info)#, precision=precision

-    if model_size>30:
-        return styled_error("Due to limited GPU availability, evaluations for models larger than 30B are currently not automated. Please open a ticket here so we do it manually for you. https://huggingface.co/spaces/silma-ai/Arabic-Broad-Leaderboard/discussions")
+    if model_size>15:
+        return styled_error("Unfortunately we do not accept models above 15B parameters from the community due to limited GPU availability.")

     # Were the model card and license filled?
     try:
@@ -137,7 +137,7 @@ def add_new_eval(

     if queue_len == 0:
         queue_data = []
-    elif queue_len >= 2:
+    elif queue_len >= 1:
         return styled_warning("The evaluation queue is full at the moment. Please try again in one hour")

     queue_data.append(eval_entry)
@@ -172,5 +172,5 @@ def add_new_eval(


     return styled_message(
-        "Thank you for submitting your request! It has been placed in the evaluation queue. You can except the eval to be completed in 1 hour."
+        " Good news! Your model has been added to the evaluation queue.<br>If you do not see the results after 3 hours then please let us know by opening a community discussion."
     )
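Taken together, the submission path now rejects anything above 15B parameters and treats a single pending entry as a full queue. The stripped-down sketch below shows just that control flow, with plain strings in place of styled_error/styled_warning; the function and constant names here are invented for illustration and are not part of the repo.

```python
# Standalone sketch of the submission gates; thresholds and messages mirror the diff,
# everything else is simplified for illustration.
MAX_MODEL_SIZE_B = 15   # billions of parameters
MAX_QUEUE_LEN = 1       # one pending evaluation at a time

def check_submission(model_size_b: float, queue_len: int) -> str:
    if model_size_b > MAX_MODEL_SIZE_B:
        return ("Unfortunately we do not accept models above 15B parameters "
                "from the community due to limited GPU availability.")
    if queue_len >= MAX_QUEUE_LEN:
        return "The evaluation queue is full at the moment. Please try again in one hour"
    return ("Good news! Your model has been added to the evaluation queue.<br>"
            "If you do not see the results after 3 hours then please let us know "
            "by opening a community discussion.")

# Example: a 7B model with an empty queue gets accepted into the queue.
print(check_submission(7, 0))
```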