add filter: Use Case Area
- app.py +98 -30
- crm-results/hf_leaderboard_latency_cost.csv +37 -0
- src/display/utils.py +2 -5
- src/populate.py +2 -0
app.py
CHANGED
@@ -34,6 +34,7 @@ def update_table(
     llm_query: list,
     llm_provider_query: list,
     accuracy_method_query: str,
+    use_case_area_query: list,
     use_case_query: list,
     use_case_type_query: list,
     # type_query: list,
@@ -49,20 +50,49 @@ def update_table(
     filtered_df = filter_accuracy_method_func(filtered_df, accuracy_method_query)
 
     filtered_df["Use Case Area"] = filtered_df["Use Case Name"].apply(lambda x: x.split(": ")[0])
-
+    filtered_df = filter_use_case_area_func(filtered_df, use_case_area_query)
     filtered_df = filter_use_case_func(filtered_df, use_case_query)
     filtered_df = filter_use_case_type_func(filtered_df, use_case_type_query)
     df = select_columns(filtered_df, columns)
     return df
 
 
+def init_leaderboard_df(
+    leaderboard_df: pd.DataFrame,
+    columns: list,
+    llm_query: list,
+    llm_provider_query: list,
+    accuracy_method_query: str,
+    use_case_area_query: list,
+    use_case_query: list,
+    use_case_type_query: list,
+):
+    return update_table(
+        leaderboard_df,
+        columns,
+        llm_query,
+        llm_provider_query,
+        accuracy_method_query,
+        use_case_area_query,
+        use_case_query,
+        use_case_type_query,
+    )
+
+
 def filter_accuracy_method_func(df: pd.DataFrame, accuracy_method_query: str) -> pd.DataFrame:
     return df[df["Accuracy Method"] == accuracy_method_query]
 
 
+def filter_use_case_area_func(df: pd.DataFrame, use_case_area_query: list) -> pd.DataFrame:
+    return df[
+        df["Use Case Area"].apply(
+            lambda x: len(set([_.strip() for _ in x.split("&")]).intersection(use_case_area_query))
+        )
+        > 0
+    ]
+
+
 def filter_use_case_func(df: pd.DataFrame, use_case_query: list) -> pd.DataFrame:
-    # print(use_case_query)
-    # print(df[df["Use Case Name"].isin(["Service: Conversation summary"])])
     return df[df["Use Case Name"].isin(use_case_query)]
 
 
@@ -170,7 +200,33 @@ with demo:
     # )
     with gr.Row():
         with gr.Column():
-            …
+            filter_llm = gr.CheckboxGroup(
+                choices=list(original_df["Model Name"].unique()),
+                value=list(original_df["Model Name"].unique()),
+                label="Model Name",
+                info="",
+                interactive=True,
+            )
+        with gr.Column():
+            filter_llm_provider = gr.CheckboxGroup(
+                choices=list(original_df["LLM Provider"].unique()),
+                value=list(original_df["LLM Provider"].unique()),
+                label="LLM Provider",
+                info="",
+                interactive=True,
+            )
+    with gr.Row():
+        filter_use_case = gr.CheckboxGroup(
+            choices=list(original_df["Use Case Name"].unique()),
+            value=list(original_df["Use Case Name"].unique()),
+            label="Use Case",
+            info="",
+            # multiselect=True,
+            interactive=True,
+        )
+    with gr.Row():
+        with gr.Column():
+            filter_use_case_area = gr.CheckboxGroup(
                 choices=["Service", "Sales"],
                 value=["Service", "Sales"],
                 label="Use Case Area",
@@ -185,15 +241,15 @@ with demo:
                 info="",
                 interactive=True,
             )
-        with gr.Column():
-            filter_use_case = gr.Dropdown(
-                choices=list(original_df["Use Case Name"].unique()),
-                value=list(original_df["Use Case Name"].unique()),
-                label="Use Case",
-                info="",
-                multiselect=True,
-                interactive=True,
-            )
+        # with gr.Column():
+        #     filter_use_case = gr.Dropdown(
+        #         choices=list(original_df["Use Case Name"].unique()),
+        #         value=list(original_df["Use Case Name"].unique()),
+        #         label="Use Case",
+        #         info="",
+        #         multiselect=True,
+        #         interactive=True,
+        #     )
         with gr.Column():
             filter_metric_area = gr.CheckboxGroup(
                 choices=["Accuracy", "Speed (Latency)", "Trust & Safety", "Cost"],
@@ -217,25 +273,35 @@ with demo:
                 info="Range: 0.0 to 4.0",
                 interactive=True,
             )
-        with gr.Column():
-            filter_llm = gr.CheckboxGroup(
-                choices=list(original_df["Model Name"].unique()),
-                value=list(leaderboard_df["Model Name"].unique()),
-                label="Model Name",
-                info="",
-                interactive=True,
-            )
-        with gr.Column():
-            filter_llm_provider = gr.CheckboxGroup(
-                choices=list(original_df["LLM Provider"].unique()),
-                value=list(leaderboard_df["LLM Provider"].unique()),
-                label="LLM Provider",
-                info="",
-                interactive=True,
-            )
+        # with gr.Column():
+        #     filter_llm = gr.CheckboxGroup(
+        #         choices=list(original_df["Model Name"].unique()),
+        #         value=list(leaderboard_df["Model Name"].unique()),
+        #         label="Model Name",
+        #         info="",
+        #         interactive=True,
+        #     )
+        # with gr.Column():
+        #     filter_llm_provider = gr.CheckboxGroup(
+        #         choices=list(original_df["LLM Provider"].unique()),
+        #         value=list(leaderboard_df["LLM Provider"].unique()),
+        #         label="LLM Provider",
+        #         info="",
+        #         interactive=True,
+        #     )
 
     leaderboard_table = gr.components.Dataframe(
-        value=leaderboard_df[[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value],
+        # value=leaderboard_df[[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value],
+        value=init_leaderboard_df(
+            leaderboard_df,
+            shown_columns.value,
+            filter_llm.value,
+            filter_llm_provider.value,
+            filter_accuracy_method.value,
+            filter_use_case_area.value,
+            filter_use_case.value,
+            filter_use_case_type.value,
+        ),
         headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
         datatype=TYPES,
         elem_id="leaderboard-table",
@@ -268,6 +334,7 @@ with demo:
             filter_llm,
             filter_llm_provider,
             filter_accuracy_method,
+            filter_use_case_area,
             filter_use_case,
             filter_use_case_type,
             # filter_columns_type,
@@ -283,6 +350,7 @@ with demo:
             filter_llm,
             filter_llm_provider,
             filter_accuracy_method,
+            filter_use_case_area,
             filter_use_case,
             filter_use_case_type,
             # filter_columns_type,
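Note: a minimal, self-contained sketch of how the new area filter behaves. The "Use Case Name" rows below are illustrative placeholders, not leaderboard data; only the two transformations mirror the helpers added above.

import pandas as pd

# Illustrative rows only; the real data is loaded from crm-results/*.csv.
df = pd.DataFrame(
    {
        "Use Case Name": [
            "Service: Conversation summary",
            "Sales: Email generation",
            "Sales & Service: Knowledge answer",
        ]
    }
)

# Same derivation as in update_table(): the area is the prefix before ": ".
df["Use Case Area"] = df["Use Case Name"].apply(lambda x: x.split(": ")[0])

def filter_use_case_area_func(df: pd.DataFrame, use_case_area_query: list) -> pd.DataFrame:
    # Keep a row if any of its "&"-separated areas is among the selected areas.
    return df[
        df["Use Case Area"].apply(
            lambda x: len(set([_.strip() for _ in x.split("&")]).intersection(use_case_area_query))
        )
        > 0
    ]

print(filter_use_case_area_func(df, ["Sales"]))
# Keeps "Sales: Email generation" and "Sales & Service: Knowledge answer".

Splitting on "&" means a row whose area covers several clouds is shown as long as at least one of its areas is selected in the checkbox group.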
crm-results/hf_leaderboard_latency_cost.csv
ADDED
@@ -0,0 +1,37 @@
+Model Name,Use Case Type (Long vs Short),Platform,Mean Latency (sec) per Request,Mean Output Tokens,Mean Cost per 1K Requests,Cost Band
+AI21 Jamba-Instruct,Long,AI21,4.0,232.9,1.6,High
+AI21 Jamba-Instruct,Short,AI21,4.0,243.9,0.5,High
+Claude 3 Haiku,Long,Bedrock,2.8,236.9,1.0,High
+Claude 3 Haiku,Short,Bedrock,2.2,245.4,0.4,High
+Claude 3 Opus,Long,Bedrock,12.2,242.7,61.1,High
+Claude 3 Opus,Short,Bedrock,8.4,243.2,25.4,High
+Cohere Command R+,Long,Bedrock,7.7,245.7,11.7,High
+Cohere Command R+,Short,Bedrock,7.1,249.9,5.1,High
+Cohere Command Text,Long,Bedrock,12.9,238.7,4.3,High
+Cohere Command Text,Short,Bedrock,9.6,245.6,1.1,High
+Gemini Pro 1.5,Long,Google,5.5,245.7,11.0,High
+Gemini Pro 1.5,Short,Google,5.4,247.5,3.3,High
+Gemini Pro 1,Long,Google,6.0,228.9,1.7,High
+Gemini Pro 1,Short,Google,4.4,247.4,0.6,High
+GPT 3.5 Turbo,Long,OpenAI,4.5,249.9,1.6,High
+GPT 3.5 Turbo,Short,OpenAI,4.2,238.3,0.6,High
+GPT 4 Turbo,Long,OpenAI,12.3,247.6,32.0,High
+GPT 4 Turbo,Short,OpenAI,12.3,250.0,11.7,High
+GPT4-o,Long,OpenAI,5.1,248.4,15.9,High
+GPT4-o,Short,OpenAI,5.0,250.0,5.8,High
+Mistral 7B,Long,Self-host (g5.48xlarge),8.83,242.0,16.5,High
+Mistral 7B,Short,Self-host (g5.48xlarge),8.31,247.0,15.5,High
+LLaMA 3 8B,Long,Self-host (g5.48xlarge),3.76,251.5,7.0,High
+LLaMA 3 8B,Short,Self-host (g5.48xlarge),3.23,243.6,6.0,High
+LLaMA 3 70B,Long,Self-host (p4d.24xlarge),20.1,243.9,67.7,High
+LLaMA 3 70B,Short,Self-host (p4d.24xlarge),29.4,251.2,99.0,High
+Mixtral 8x7B,Long,Self-host (p4d.24xlarge),2.44,248.5,8.22,High
+Mixtral 8x7B,Short,Self-host (p4d.24xlarge),2.41,250.0,8.11,High
+SF-TextBase 7B,Long,Self-host (g5.48xlarge),8.99,248.5,16.80,High
+SF-TextBase 7B,Short,Self-host (g5.48xlarge),8.29,248.7,15.50,High
+SF-TextBase 70B,Long,Self-host (p4de.24xlarge),6.52,253.7,28.17,High
+SF-TextBase 70B,Short,Self-host (p4de.24xlarge),6.24,249.7,26.96,High
+SF-TextSum,Long,Self-host (g5.48xlarge),8.85,244.0,16.55,High
+SF-TextSum,Short,Self-host (g5.48xlarge),8.34,250.4,15.60,High
+XGen 2,Long,Self-host (p4de.24xlarge),3.71,250.0,16.03,High
+XGen 2,Short,Self-host (p4de.24xlarge),2.64,250.0,11.40,High
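Note: a usage illustration only; this aggregation is not part of the commit. It assumes the file is read from the path added above, and the per-model averaging is just an example of how the Long/Short rows could be summarized.

import pandas as pd

# Load the latency/cost results added in this commit.
df = pd.read_csv("crm-results/hf_leaderboard_latency_cost.csv")

# Example: average latency and cost per model across the Long/Short request types.
summary = (
    df.groupby("Model Name")[["Mean Latency (sec) per Request", "Mean Cost per 1K Requests"]]
    .mean()
    .sort_values("Mean Latency (sec) per Request")
)
print(summary.head())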
src/display/utils.py
CHANGED
@@ -25,14 +25,11 @@ class ColumnContent:
 ## Leaderboard columns
 auto_eval_column_dict = []
 # Init
-# auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
 auto_eval_column_dict.append(
     ["model", ColumnContent, ColumnContent("Model Name", "markdown", True, never_hidden=True)]
 )
-auto_eval_column_dict.append(
-    …
-)
-auto_eval_column_dict.append(["use_case_name", ColumnContent, ColumnContent("Use Case Name", "markdown", True)])
+auto_eval_column_dict.append(["model_provider", ColumnContent, ColumnContent("LLM Provider", "markdown", True)])
+auto_eval_column_dict.append(["use_case_name", ColumnContent, ColumnContent("Use Case Name", "markdown", False)])
 auto_eval_column_dict.append(["use_case_type", ColumnContent, ColumnContent("Use Case Type", "markdown", False)])
 
 auto_eval_column_dict.append(["accuracy_method", ColumnContent, ColumnContent("Accuracy Method", "markdown", False)])
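Note: a simplified, self-contained sketch of the column-registry pattern these appends feed into. The ColumnContent dataclass and the fields() helper shown here are assumptions modeled on the standard Hugging Face leaderboard template, not code copied from this repository; only the three append lines match the diff.

from dataclasses import dataclass, make_dataclass

# Assumed shape of ColumnContent; the real definition lives in src/display/utils.py.
@dataclass(frozen=True)
class ColumnContent:
    name: str
    type: str
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False

def fields(raw_class):
    # Template-style helper: yields the ColumnContent defaults of the generated class.
    return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]

auto_eval_column_dict = []
auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model Name", "markdown", True, never_hidden=True)])
auto_eval_column_dict.append(["model_provider", ColumnContent, ColumnContent("LLM Provider", "markdown", True)])
auto_eval_column_dict.append(["use_case_name", ColumnContent, ColumnContent("Use Case Name", "markdown", False)])

# The holder that app.py iterates with fields(AutoEvalColumn).
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)

# "LLM Provider" is now shown by default; "Use Case Name" is hidden by default.
print([c.name for c in fields(AutoEvalColumn) if c.displayed_by_default])
# -> ['Model Name', 'LLM Provider']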
src/populate.py
CHANGED
@@ -11,6 +11,8 @@ from src.leaderboard.read_evals import get_raw_eval_results
 def get_leaderboard_df_crm(crm_results_path: str, cols: list) -> pd.DataFrame:
     """Creates a dataframe from all the individual experiment results"""
     leaderboard_accuracy_df = pd.read_csv(os.path.join(crm_results_path, "hf_leaderboard_accuracy.csv"))
+    sf_finetuned_models = ["SF-TextBase 70B", "SF-TextBase 7B", "SF-TextSum"]
+    leaderboard_accuracy_df = leaderboard_accuracy_df[~leaderboard_accuracy_df["Model Name"].isin(sf_finetuned_models)]
     # leaderboard_accuracy_df = leaderboard_accuracy_df.sort_values(
     #     by=[AutoEvalColumn.accuracy_metric_average.name], ascending=False
     # )
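Note: a minimal sketch of the new exclusion on made-up rows. The model names mirror the sf_finetuned_models list added above; the Accuracy values are placeholders, not results from hf_leaderboard_accuracy.csv.

import pandas as pd

# Toy frame standing in for hf_leaderboard_accuracy.csv (illustrative values only).
leaderboard_accuracy_df = pd.DataFrame(
    {
        "Model Name": ["GPT 4 Turbo", "SF-TextSum", "Claude 3 Haiku", "SF-TextBase 7B"],
        "Accuracy": [0.81, 0.77, 0.74, 0.69],
    }
)

# Same exclusion added in populate.py: drop the SF-* fine-tuned models from the accuracy table.
sf_finetuned_models = ["SF-TextBase 70B", "SF-TextBase 7B", "SF-TextSum"]
filtered = leaderboard_accuracy_df[~leaderboard_accuracy_df["Model Name"].isin(sf_finetuned_models)]
print(filtered["Model Name"].tolist())  # ['GPT 4 Turbo', 'Claude 3 Haiku']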