yibum committed · Commit 1cade3b · Parent(s): 429ce41

add filter: Use Case Area
app.py CHANGED
@@ -34,6 +34,7 @@ def update_table(
    llm_query: list,
    llm_provider_query: list,
    accuracy_method_query: str,
+     use_case_area_query: list,
    use_case_query: list,
    use_case_type_query: list,
    # type_query: list,
@@ -49,20 +50,49 @@ def update_table(
    filtered_df = filter_accuracy_method_func(filtered_df, accuracy_method_query)

    filtered_df["Use Case Area"] = filtered_df["Use Case Name"].apply(lambda x: x.split(": ")[0])
-     # print(filtered_df["Use Case Area"].unique())
+     filtered_df = filter_use_case_area_func(filtered_df, use_case_area_query)
    filtered_df = filter_use_case_func(filtered_df, use_case_query)
    filtered_df = filter_use_case_type_func(filtered_df, use_case_type_query)
    df = select_columns(filtered_df, columns)
    return df


+ def init_leaderboard_df(
+     leaderboard_df: pd.DataFrame,
+     columns: list,
+     llm_query: list,
+     llm_provider_query: list,
+     accuracy_method_query: str,
+     use_case_area_query: list,
+     use_case_query: list,
+     use_case_type_query: list,
+ ):
+     return update_table(
+         leaderboard_df,
+         columns,
+         llm_query,
+         llm_provider_query,
+         accuracy_method_query,
+         use_case_area_query,
+         use_case_query,
+         use_case_type_query,
+     )
+
+
def filter_accuracy_method_func(df: pd.DataFrame, accuracy_method_query: str) -> pd.DataFrame:
    return df[df["Accuracy Method"] == accuracy_method_query]


+ def filter_use_case_area_func(df: pd.DataFrame, use_case_area_query: list) -> pd.DataFrame:
+     return df[
+         df["Use Case Area"].apply(
+             lambda x: len(set([_.strip() for _ in x.split("&")]).intersection(use_case_area_query))
+         )
+         > 0
+     ]
+
+
def filter_use_case_func(df: pd.DataFrame, use_case_query: list) -> pd.DataFrame:
-     # print(use_case_query)
-     # print(df[df["Use Case Name"].isin(["Service: Conversation summary"])])
    return df[df["Use Case Name"].isin(use_case_query)]
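A quick illustration of what the new area filter does, on toy rows (the "Sales" and "Service & Sales" names below are hypothetical, included only to exercise the "&"-splitting logic):

import pandas as pd

# Toy input; only "Use Case Name" matters for this sketch.
df = pd.DataFrame(
    {
        "Use Case Name": [
            "Service: Conversation summary",
            "Sales: Email generation",        # hypothetical row
            "Service & Sales: Lead routing",  # hypothetical row
        ]
    }
)

# Same derivation update_table performs: the area is the prefix before ": ".
df["Use Case Area"] = df["Use Case Name"].apply(lambda x: x.split(": ")[0])

# Same predicate as filter_use_case_area_func: keep rows whose "&"-separated
# areas overlap the selected checkbox values.
selected_areas = ["Sales"]
mask = df["Use Case Area"].apply(
    lambda x: len(set(_.strip() for _ in x.split("&")).intersection(selected_areas)) > 0
)
print(df.loc[mask, "Use Case Name"].tolist())
# ['Sales: Email generation', 'Service & Sales: Lead routing']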
 
@@ -170,7 +200,33 @@ with demo:
    # )
    with gr.Row():
        with gr.Column():
-             filter_use_case_type = gr.CheckboxGroup(
+             filter_llm = gr.CheckboxGroup(
+                 choices=list(original_df["Model Name"].unique()),
+                 value=list(original_df["Model Name"].unique()),
+                 label="Model Name",
+                 info="",
+                 interactive=True,
+             )
+         with gr.Column():
+             filter_llm_provider = gr.CheckboxGroup(
+                 choices=list(original_df["LLM Provider"].unique()),
+                 value=list(original_df["LLM Provider"].unique()),
+                 label="LLM Provider",
+                 info="",
+                 interactive=True,
+             )
+     with gr.Row():
+         filter_use_case = gr.CheckboxGroup(
+             choices=list(original_df["Use Case Name"].unique()),
+             value=list(original_df["Use Case Name"].unique()),
+             label="Use Case",
+             info="",
+             # multiselect=True,
+             interactive=True,
+         )
+     with gr.Row():
+         with gr.Column():
+             filter_use_case_area = gr.CheckboxGroup(
                choices=["Service", "Sales"],
                value=["Service", "Sales"],
                label="Use Case Area",
@@ -185,15 +241,15 @@ with demo:
                info="",
                interactive=True,
            )
-         with gr.Column():
-             filter_use_case = gr.Dropdown(
-                 choices=list(original_df["Use Case Name"].unique()),
-                 value=list(original_df["Use Case Name"].unique()),
-                 label="Use Case",
-                 info="",
-                 multiselect=True,
-                 interactive=True,
-             )
+         # with gr.Column():
+         #     filter_use_case = gr.Dropdown(
+         #         choices=list(original_df["Use Case Name"].unique()),
+         #         value=list(original_df["Use Case Name"].unique()),
+         #         label="Use Case",
+         #         info="",
+         #         multiselect=True,
+         #         interactive=True,
+         #     )
        with gr.Column():
            filter_metric_area = gr.CheckboxGroup(
                choices=["Accuracy", "Speed (Latency)", "Trust & Safety", "Cost"],
@@ -217,25 +273,35 @@ with demo:
                info="Range: 0.0 to 4.0",
                interactive=True,
            )
-         with gr.Column():
-             filter_llm = gr.CheckboxGroup(
-                 choices=list(original_df["Model Name"].unique()),
-                 value=list(leaderboard_df["Model Name"].unique()),
-                 label="Model Name",
-                 info="",
-                 interactive=True,
-             )
-         with gr.Column():
-             filter_llm_provider = gr.CheckboxGroup(
-                 choices=list(original_df["LLM Provider"].unique()),
-                 value=list(leaderboard_df["LLM Provider"].unique()),
-                 label="LLM Provider",
-                 info="",
-                 interactive=True,
-             )
+         # with gr.Column():
+         #     filter_llm = gr.CheckboxGroup(
+         #         choices=list(original_df["Model Name"].unique()),
+         #         value=list(leaderboard_df["Model Name"].unique()),
+         #         label="Model Name",
+         #         info="",
+         #         interactive=True,
+         #     )
+         # with gr.Column():
+         #     filter_llm_provider = gr.CheckboxGroup(
+         #         choices=list(original_df["LLM Provider"].unique()),
+         #         value=list(leaderboard_df["LLM Provider"].unique()),
+         #         label="LLM Provider",
+         #         info="",
+         #         interactive=True,
+         #     )

    leaderboard_table = gr.components.Dataframe(
-         value=leaderboard_df[[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value],
+         # value=leaderboard_df[[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value],
+         value=init_leaderboard_df(
+             leaderboard_df,
+             shown_columns.value,
+             filter_llm.value,
+             filter_llm_provider.value,
+             filter_accuracy_method.value,
+             filter_use_case_area.value,
+             filter_use_case.value,
+             filter_use_case_type.value,
+         ),
        headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
        datatype=TYPES,
        elem_id="leaderboard-table",
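The init_leaderboard_df call above seeds the table with an already-filtered view. This works because a Gradio component's default selection is available as a plain Python value via .value at build time, before any event fires; a minimal sketch of that mechanism with toy names (not from the app):

import gradio as gr

with gr.Blocks() as demo:
    areas = gr.CheckboxGroup(choices=["Service", "Sales"], value=["Sales"], label="Use Case Area")
    # Before any event fires, .value is simply the list passed above, so it can be
    # used to compute the Dataframe's initial contents.
    print(areas.value)  # ['Sales']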
@@ -268,6 +334,7 @@ with demo:
            filter_llm,
            filter_llm_provider,
            filter_accuracy_method,
+             filter_use_case_area,
            filter_use_case,
            filter_use_case_type,
            # filter_columns_type,
@@ -283,6 +350,7 @@ with demo:
            filter_llm,
            filter_llm_provider,
            filter_accuracy_method,
+             filter_use_case_area,
            filter_use_case,
            filter_use_case_type,
            # filter_columns_type,
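The two one-line hunks above only extend the inputs lists of existing change handlers; the handlers themselves sit outside this diff. A hedged, self-contained sketch of the usual Gradio wiring pattern such lists feed into (toy component and function names, one filter instead of the app's full list):

import gradio as gr
import pandas as pd

# Placeholder rows, not leaderboard data.
toy_df = pd.DataFrame({"Model Name": ["A", "B"], "Use Case Area": ["Service", "Sales"]})

def toy_update(area_selection: list) -> pd.DataFrame:
    # Stand-in for update_table with a single filter argument.
    return toy_df[toy_df["Use Case Area"].isin(area_selection)]

with gr.Blocks() as demo:
    filter_area = gr.CheckboxGroup(choices=["Service", "Sales"], value=["Service", "Sales"], label="Use Case Area")
    table = gr.Dataframe(value=toy_update(filter_area.value))
    # Every filter component is listed in the handler's inputs; the hunks above add
    # filter_use_case_area to exactly this kind of list in the real app.
    filter_area.change(toy_update, inputs=[filter_area], outputs=table)

# demo.launch()  # uncomment to run locally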
 
crm-results/hf_leaderboard_latency_cost.csv ADDED
@@ -0,0 +1,37 @@
+ Model Name,Use Case Type (Long vs Short),Platform,Mean Latency (sec) per Request,Mean Output Tokens,Mean Cost per 1K Requests,Cost Band
+ AI21 Jamba-Instruct,Long,AI21,4.0,232.9,1.6,High
+ AI21 Jamba-Instruct,Short,AI21,4.0,243.9,0.5,High
+ Claude 3 Haiku,Long,Bedrock,2.8,236.9,1.0,High
+ Claude 3 Haiku,Short,Bedrock,2.2,245.4,0.4,High
+ Claude 3 Opus,Long,Bedrock,12.2,242.7,61.1,High
+ Claude 3 Opus,Short,Bedrock,8.4,243.2,25.4,High
+ Cohere Command R+,Long,Bedrock,7.7,245.7,11.7,High
+ Cohere Command R+,Short,Bedrock,7.1,249.9,5.1,High
+ Cohere Command Text,Long,Bedrock,12.9,238.7,4.3,High
+ Cohere Command Text,Short,Bedrock,9.6,245.6,1.1,High
+ Gemini Pro 1.5,Long,Google,5.5,245.7,11.0,High
+ Gemini Pro 1.5,Short,Google,5.4,247.5,3.3,High
+ Gemini Pro 1,Long,Google,6.0,228.9,1.7,High
+ Gemini Pro 1,Short,Google,4.4,247.4,0.6,High
+ GPT 3.5 Turbo,Long,OpenAI,4.5,249.9,1.6,High
+ GPT 3.5 Turbo,Short,OpenAI,4.2,238.3,0.6,High
+ GPT 4 Turbo,Long,OpenAI,12.3,247.6,32.0,High
+ GPT 4 Turbo,Short,OpenAI,12.3,250.0,11.7,High
+ GPT4-o,Long,OpenAI,5.1,248.4,15.9,High
+ GPT4-o,Short,OpenAI,5.0,250.0,5.8,High
+ Mistral 7B,Long,Self-host (g5.48xlarge),8.83,242.0,16.5,High
+ Mistral 7B,Short,Self-host (g5.48xlarge),8.31,247.0,15.5,High
+ LLaMA 3 8B,Long,Self-host (g5.48xlarge),3.76,251.5,7.0,High
+ LLaMA 3 8B,Short,Self-host (g5.48xlarge),3.23,243.6,6.0,High
+ LLaMA 3 70B,Long,Self-host (p4d.24xlarge),20.1,243.9,67.7,High
+ LLaMA 3 70B,Short,Self-host (p4d.24xlarge),29.4,251.2,99.0,High
+ Mixtral 8x7B,Long,Self-host (p4d.24xlarge),2.44,248.5,8.22,High
+ Mixtral 8x7B,Short,Self-host (p4d.24xlarge),2.41,250.0,8.11,High
+ SF-TextBase 7B,Long,Self-host (g5.48xlarge),8.99,248.5,16.80,High
+ SF-TextBase 7B,Short,Self-host (g5.48xlarge),8.29,248.7,15.50,High
+ SF-TextBase 70B,Long,Self-host (p4de.24xlarge),6.52,253.7,28.17,High
+ SF-TextBase 70B,Short,Self-host (p4de.24xlarge),6.24,249.7,26.96,High
+ SF-TextSum,Long,Self-host (g5.48xlarge),8.85,244.0,16.55,High
+ SF-TextSum,Short,Self-host (g5.48xlarge),8.34,250.4,15.60,High
+ XGen 2,Long,Self-host (p4de.24xlarge),3.71,250.0,16.03,High
+ XGen 2,Short,Self-host (p4de.24xlarge),2.64,250.0,11.40,High
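The added file is an ordinary CSV, so it can be inspected directly with pandas; a small sketch using the committed path and the column names from the header row:

import pandas as pd

df = pd.read_csv("crm-results/hf_leaderboard_latency_cost.csv")

# Mean request latency per platform, averaged over Long and Short use case types.
print(df.groupby("Platform")["Mean Latency (sec) per Request"].mean().sort_values())

# Cost spread for long-prompt use cases.
long_rows = df[df["Use Case Type (Long vs Short)"] == "Long"]
print(long_rows[["Model Name", "Mean Cost per 1K Requests"]].sort_values("Mean Cost per 1K Requests"))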
src/display/utils.py CHANGED
@@ -25,14 +25,11 @@ class ColumnContent:
## Leaderboard columns
auto_eval_column_dict = []
# Init
- # auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
auto_eval_column_dict.append(
    ["model", ColumnContent, ColumnContent("Model Name", "markdown", True, never_hidden=True)]
)
- auto_eval_column_dict.append(
-     ["model_provider", ColumnContent, ColumnContent("LLM Provider", "markdown", True, never_hidden=True)]
- )
- auto_eval_column_dict.append(["use_case_name", ColumnContent, ColumnContent("Use Case Name", "markdown", True)])
+ auto_eval_column_dict.append(["model_provider", ColumnContent, ColumnContent("LLM Provider", "markdown", True)])
+ auto_eval_column_dict.append(["use_case_name", ColumnContent, ColumnContent("Use Case Name", "markdown", False)])
auto_eval_column_dict.append(["use_case_type", ColumnContent, ColumnContent("Use Case Type", "markdown", False)])

auto_eval_column_dict.append(["accuracy_method", ColumnContent, ColumnContent("Accuracy Method", "markdown", False)])
src/populate.py CHANGED
@@ -11,6 +11,8 @@ from src.leaderboard.read_evals import get_raw_eval_results
def get_leaderboard_df_crm(crm_results_path: str, cols: list) -> pd.DataFrame:
    """Creates a dataframe from all the individual experiment results"""
    leaderboard_accuracy_df = pd.read_csv(os.path.join(crm_results_path, "hf_leaderboard_accuracy.csv"))
+     sf_finetuned_models = ["SF-TextBase 70B", "SF-TextBase 7B", "SF-TextSum"]
+     leaderboard_accuracy_df = leaderboard_accuracy_df[~leaderboard_accuracy_df["Model Name"].isin(sf_finetuned_models)]
    # leaderboard_accuracy_df = leaderboard_accuracy_df.sort_values(
    #     by=[AutoEvalColumn.accuracy_metric_average.name], ascending=False
    # )
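The new exclusion is the standard pandas idiom of negating isin; a toy example (the accuracy numbers are placeholders, not leaderboard results):

import pandas as pd

toy = pd.DataFrame(
    {
        "Model Name": ["GPT 4 Turbo", "SF-TextSum", "SF-TextBase 7B"],
        "Accuracy": [0.0, 0.0, 0.0],  # placeholder values only
    }
)

sf_finetuned_models = ["SF-TextBase 70B", "SF-TextBase 7B", "SF-TextSum"]
kept = toy[~toy["Model Name"].isin(sf_finetuned_models)]
print(kept["Model Name"].tolist())  # ['GPT 4 Turbo']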