Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
eduagarcia
committed on
Commit
·
359d8a9
1
Parent(s):
ebb5810
Evaluation time metric and plot
Browse files
- app.py +13 -2
- src/display/utils.py +8 -3
- src/leaderboard/filter_models.py +2 -2
- src/leaderboard/read_evals.py +5 -2
- src/tools/plots.py +55 -0
app.py
CHANGED
@@ -38,6 +38,7 @@ from src.tools.plots import (
|
|
38 |
create_metric_plot_obj,
|
39 |
create_plot_df,
|
40 |
create_scores_df,
|
|
|
41 |
)
|
42 |
|
43 |
# Start ephemeral Spaces on PRs (see config in README.md)
|
@@ -344,7 +345,7 @@ with demo:
|
|
344 |
queue=True,
|
345 |
)
|
346 |
|
347 |
-
with gr.TabItem("📈 Metrics
|
348 |
with gr.Row():
|
349 |
with gr.Column():
|
350 |
chart = create_metric_plot_obj(
|
@@ -359,7 +360,17 @@ with demo:
|
|
359 |
BENCHMARK_COLS,
|
360 |
title="Top Scores and Human Baseline Over Time (from last update)",
|
361 |
)
|
362 |
-
gr.Plot(value=chart, min_width=500)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
363 |
with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
|
364 |
gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
|
365 |
gr.Markdown(FAQ_TEXT, elem_classes="markdown-text")
|
|
|
38 |
create_metric_plot_obj,
|
39 |
create_plot_df,
|
40 |
create_scores_df,
|
41 |
+
create_lat_score_mem_plot_obj
|
42 |
)
|
43 |
|
44 |
# Start ephemeral Spaces on PRs (see config in README.md)
|
|
|
345 |
queue=True,
|
346 |
)
|
347 |
|
348 |
+
with gr.TabItem("📈 Metrics", elem_id="llm-benchmark-tab-table", id=4):
|
349 |
with gr.Row():
|
350 |
with gr.Column():
|
351 |
chart = create_metric_plot_obj(
|
|
|
360 |
BENCHMARK_COLS,
|
361 |
title="Top Scores and Human Baseline Over Time (from last update)",
|
362 |
)
|
363 |
+
gr.Plot(value=chart, min_width=500)
|
364 |
+
with gr.Row():
|
365 |
+
with gr.Column():
|
366 |
+
fig = create_lat_score_mem_plot_obj(leaderboard_df)
|
367 |
+
plot = gr.components.Plot(
|
368 |
+
value=fig,
|
369 |
+
elem_id="plot",
|
370 |
+
show_label=False,
|
371 |
+
)
|
372 |
+
gr.HTML("👆 Hover over the points 👆 for additional information. ",elem_id="text")
|
373 |
+
gr.HTML('This plot the Evaluation Time from our backend GPU (Nvdia A100-80G) to run all the benchmarks, it\'s not a very precise performance benchmark of the models, for that look for the <a href="https://huggingface.co/spaces/optimum/llm-perf-leaderboard" target="_blank">🤗 LLM-Perf Leaderboard</a>',elem_id="text")
|
374 |
with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
|
375 |
gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
|
376 |
gr.Markdown(FAQ_TEXT, elem_classes="markdown-text")
|
src/display/utils.py
CHANGED
@@ -109,8 +109,11 @@ auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Avai
|
|
109 |
auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
|
110 |
auto_eval_column_dict.append(["flagged", ColumnContent, ColumnContent("Flagged", "bool", False, hidden=True)])
|
111 |
auto_eval_column_dict.append(["moe", ColumnContent, ColumnContent("MoE", "bool", False, hidden=True)])
|
|
|
112 |
# Dummy column for the search bar (hidden by the custom CSS)
|
113 |
-
auto_eval_column_dict.append(["dummy", ColumnContent, ColumnContent("
|
|
|
|
|
114 |
|
115 |
# We use make dataclass to dynamically fill the scores from Tasks
|
116 |
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
|
@@ -147,7 +150,8 @@ baseline_row = {
|
|
147 |
AutoEvalColumn.likes.name: 0,
|
148 |
AutoEvalColumn.license.name: "",
|
149 |
AutoEvalColumn.still_on_hub.name: False,
|
150 |
-
AutoEvalColumn.moe.name: False
|
|
|
151 |
}
|
152 |
|
153 |
baseline_list = []
|
@@ -187,7 +191,8 @@ human_baseline_row = {
|
|
187 |
AutoEvalColumn.likes.name: 0,
|
188 |
AutoEvalColumn.license.name: "",
|
189 |
AutoEvalColumn.still_on_hub.name: False,
|
190 |
-
AutoEvalColumn.moe.name: False
|
|
|
191 |
}
|
192 |
|
193 |
baseline_list = []
|
|
|
109 |
auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
|
110 |
auto_eval_column_dict.append(["flagged", ColumnContent, ColumnContent("Flagged", "bool", False, hidden=True)])
|
111 |
auto_eval_column_dict.append(["moe", ColumnContent, ColumnContent("MoE", "bool", False, hidden=True)])
|
112 |
+
auto_eval_column_dict.append(["eval_time", ColumnContent, ColumnContent("Evaluation Time (s)", "number", False)])
|
113 |
# Dummy column for the search bar (hidden by the custom CSS)
|
114 |
+
auto_eval_column_dict.append(["dummy", ColumnContent, ColumnContent("Model Name", "str", False, dummy=True)])
|
115 |
+
|
116 |
+
|
117 |
|
118 |
# We use make dataclass to dynamically fill the scores from Tasks
|
119 |
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
|
|
|
150 |
AutoEvalColumn.likes.name: 0,
|
151 |
AutoEvalColumn.license.name: "",
|
152 |
AutoEvalColumn.still_on_hub.name: False,
|
153 |
+
AutoEvalColumn.moe.name: False,
|
154 |
+
AutoEvalColumn.eval_time.name: 0.0
|
155 |
}
|
156 |
|
157 |
baseline_list = []
|
|
|
191 |
AutoEvalColumn.likes.name: 0,
|
192 |
AutoEvalColumn.license.name: "",
|
193 |
AutoEvalColumn.still_on_hub.name: False,
|
194 |
+
AutoEvalColumn.moe.name: False,
|
195 |
+
AutoEvalColumn.eval_time.name: 0.0
|
196 |
}
|
197 |
|
198 |
baseline_list = []
|
src/leaderboard/filter_models.py
CHANGED
@@ -99,7 +99,7 @@ def flag_models(leaderboard_data: list[dict]):
|
|
99 |
if model_data[AutoEvalColumn.flagged.name] == True:
|
100 |
flag_key = "merged"
|
101 |
else:
|
102 |
-
flag_key = model_data[
|
103 |
|
104 |
if flag_key in FLAGGED_MODELS:
|
105 |
issue_num = FLAGGED_MODELS[flag_key].split("/")[-1]
|
@@ -118,7 +118,7 @@ def flag_models(leaderboard_data: list[dict]):
|
|
118 |
def remove_forbidden_models(leaderboard_data: list[dict]):
|
119 |
indices_to_remove = []
|
120 |
for ix, model in enumerate(leaderboard_data):
|
121 |
-
if model[
|
122 |
indices_to_remove.append(ix)
|
123 |
|
124 |
for ix in reversed(indices_to_remove):
|
|
|
99 |
if model_data[AutoEvalColumn.flagged.name] == True:
|
100 |
flag_key = "merged"
|
101 |
else:
|
102 |
+
flag_key = model_data[AutoEvalColumn.dummy.name]
|
103 |
|
104 |
if flag_key in FLAGGED_MODELS:
|
105 |
issue_num = FLAGGED_MODELS[flag_key].split("/")[-1]
|
|
|
118 |
def remove_forbidden_models(leaderboard_data: list[dict]):
|
119 |
indices_to_remove = []
|
120 |
for ix, model in enumerate(leaderboard_data):
|
121 |
+
if model[AutoEvalColumn.dummy.name] in DO_NOT_SUBMIT_MODELS:
|
122 |
indices_to_remove.append(ix)
|
123 |
|
124 |
for ix in reversed(indices_to_remove):
|
src/leaderboard/read_evals.py
CHANGED
@@ -36,6 +36,7 @@ class EvalResult:
|
|
36 |
status: str = "FINISHED"
|
37 |
tags: list = None
|
38 |
json_filename: str = None
|
|
|
39 |
|
40 |
@classmethod
|
41 |
def init_from_json_file(self, json_filepath):
|
@@ -103,7 +104,8 @@ class EvalResult:
|
|
103 |
results=results,
|
104 |
precision=precision,
|
105 |
revision= config.get("model_sha", ""),
|
106 |
-
json_filename=json_filename
|
|
|
107 |
)
|
108 |
|
109 |
def update_with_request_file(self, requests_path):
|
@@ -151,7 +153,8 @@ class EvalResult:
|
|
151 |
AutoEvalColumn.still_on_hub.name: self.still_on_hub,
|
152 |
AutoEvalColumn.merged.name: "merge" in self.tags if self.tags else False,
|
153 |
AutoEvalColumn.moe.name: ("moe" in self.tags if self.tags else False) or "moe" in self.full_model.lower(),
|
154 |
-
AutoEvalColumn.flagged.name: self.flagged
|
|
|
155 |
}
|
156 |
|
157 |
for task in Tasks:
|
|
|
36 |
status: str = "FINISHED"
|
37 |
tags: list = None
|
38 |
json_filename: str = None
|
39 |
+
eval_time: float = 0.0
|
40 |
|
41 |
@classmethod
|
42 |
def init_from_json_file(self, json_filepath):
|
|
|
104 |
results=results,
|
105 |
precision=precision,
|
106 |
revision= config.get("model_sha", ""),
|
107 |
+
json_filename=json_filename,
|
108 |
+
eval_time=config.get("total_evaluation_time_seconds", 0.0)
|
109 |
)
|
110 |
|
111 |
def update_with_request_file(self, requests_path):
|
|
|
153 |
AutoEvalColumn.still_on_hub.name: self.still_on_hub,
|
154 |
AutoEvalColumn.merged.name: "merge" in self.tags if self.tags else False,
|
155 |
AutoEvalColumn.moe.name: ("moe" in self.tags if self.tags else False) or "moe" in self.full_model.lower(),
|
156 |
+
AutoEvalColumn.flagged.name: self.flagged,
|
157 |
+
AutoEvalColumn.eval_time.name: self.eval_time,
|
158 |
}
|
159 |
|
160 |
for task in Tasks:
|
src/tools/plots.py
CHANGED
@@ -151,6 +151,61 @@ def create_metric_plot_obj(
|
|
151 |
|
152 |
return fig
|
153 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
154 |
|
155 |
# Example Usage:
|
156 |
# human_baselines dictionary is defined.
|
|
|
151 |
|
152 |
return fig
|
153 |
|
154 |
+
# Marker-size buckets for the scatter plot: (exclusive upper bound on the
# value of the params column, marker size). Checked in ascending order; the
# first bound the value falls below wins.
_PARAM_SIZE_BUCKETS = (
    (0.8, 0.5),
    (2, 0.8),
    (5, 1.5),
    (10, 2.0),
    (20, 3.0),
    (40, 4.5),
)
# Marker size used for every value at or above the last bound.
_LARGEST_MARKER_SIZE = 7.0


def _params_to_marker_size(params):
    """Map a raw params-column value onto one of a few discrete marker sizes."""
    # Negative values and NaN fail this comparison and are returned untouched,
    # which is what the previous chain of range checks did with them.
    if not params >= 0:
        return params
    for upper_bound, marker_size in _PARAM_SIZE_BUCKETS:
        if params < upper_bound:
            return marker_size
    # Covers params >= 40. The previous check was a strict `> 40`, so a value
    # of exactly 40 matched no bucket and kept its raw value as marker size.
    return _LARGEST_MARKER_SIZE


def create_lat_score_mem_plot_obj(leaderboard_df):
    """Build the "Eval Time vs. Score vs. #Params" scatter plot.

    Each non-baseline row of the leaderboard becomes one point: x is the
    evaluation time in minutes (log scale), y is the average score, the
    marker size is a bucketed version of the params column and the colour
    is the architecture column.

    Args:
        leaderboard_df: Leaderboard dataframe holding the ``AutoEvalColumn``
            dummy, average, params, architecture and eval_time columns.
            It is not modified.

    Returns:
        The figure produced by ``px.scatter`` with the hover template and
        layout applied.
    """
    # Baseline rows are reference entries, not evaluated models; leave them out.
    is_baseline = leaderboard_df[AutoEvalColumn.dummy.name].isin(["baseline", "human_baseline"])
    # Copy after filtering so the column assignments below act on an
    # independent frame and never on a view of the caller's data.
    copy_df = leaderboard_df[~is_baseline].copy()

    # Columns exposed to the hover tooltip, in display order.
    SCORE_MEMORY_LATENCY_DATA = [
        AutoEvalColumn.dummy.name,
        AutoEvalColumn.average.name,
        AutoEvalColumn.params.name,
        AutoEvalColumn.architecture.name,
        "Evaluation Time (min)",
    ]

    copy_df["LLM Average Score"] = copy_df[AutoEvalColumn.average.name]
    # The eval_time column is labelled in seconds; the plot shows minutes.
    # NOTE(review): a row whose evaluation time is 0 has no position on the
    # logarithmic x axis used below — confirm how such rows should be shown.
    copy_df["Evaluation Time (min)"] = copy_df[AutoEvalColumn.eval_time.name] / 60

    # Bucket the parameter counts so very large models do not dwarf the rest.
    copy_df["size"] = copy_df[AutoEvalColumn.params.name].apply(_params_to_marker_size)

    fig = px.scatter(
        copy_df,
        x="Evaluation Time (min)",
        y="LLM Average Score",
        size="size",
        color=AutoEvalColumn.architecture.name,
        custom_data=SCORE_MEMORY_LATENCY_DATA,
        color_discrete_sequence=px.colors.qualitative.Light24,
        log_x=True,
    )
    # One "<b>column:</b> value" line per custom_data entry.
    fig.update_traces(
        hovertemplate="<br>".join(
            [f"<b>{column}:</b> %{{customdata[{i}]}}" for i, column in enumerate(SCORE_MEMORY_LATENCY_DATA)]
        )
    )
    fig.update_layout(
        title={
            "text": "Eval Time vs. Score vs. #Params",
            "y": 0.95,
            "x": 0.5,
            "xanchor": "center",
            "yanchor": "top",
        },
        xaxis_title="Time To Evaluate (min)",
        yaxis_title="LLM Average Score",
        legend_title="LLM Architecture",
        width=1200,
        height=600,
    )

    return fig
|
209 |
|
210 |
# Example Usage:
|
211 |
# human_baselines dictionary is defined.
|