yibum committed
Commit 375e6bf · 1 Parent(s): c64c31e

remove Trust & Safety tab

Files changed (3)
  1. app.py +33 -189
  2. src/display/utils.py +6 -6
  3. src/populate.py +1 -1
app.py CHANGED
@@ -3,14 +3,13 @@ import pandas as pd
 
 from src.about import CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT, INTRODUCTION_TEXT, LLM_BENCHMARKS_TEXT, TITLE
 from src.display.css_html_js import custom_css
-from src.display.utils import COLS, TS_COLS, TS_TYPES, TYPES, AutoEvalColumn, TSEvalColumn, fields
+from src.display.utils import COLS, TS_COLS, TYPES, AutoEvalColumn, fields
 from src.envs import CRM_RESULTS_PATH
 from src.populate import get_leaderboard_df_crm
 
-original_df, ts_df = get_leaderboard_df_crm(CRM_RESULTS_PATH, COLS, TS_COLS)
+original_df = get_leaderboard_df_crm(CRM_RESULTS_PATH, COLS, TS_COLS)
 
 leaderboard_df = original_df.copy()
-leaderboard_ts_df = ts_df.copy()
 # leaderboard_df = leaderboard_df.style.format({"accuracy_metric_average": "{0:.2f}"})
 
 
@@ -39,18 +38,6 @@ def update_table(
     return df.style.map(highlight_cost_band_low, props="background-color: #b3d5a4")
 
 
-def update_ts_table(
-    hidden_df: pd.DataFrame,
-    columns: list,
-    llm_query: list,
-    llm_provider_query: list,
-):
-    filtered_df = filter_llm_func(hidden_df, llm_query)
-    filtered_df = filter_llm_provider_func(filtered_df, llm_provider_query)
-    df = select_columns_ts_table(filtered_df, columns)
-    return df
-
-
 # def highlight_cols(x):
 #     df = x.copy()
 #     df.loc[:, :] = "color: black"
@@ -90,21 +77,6 @@ def init_leaderboard_df(
     )
 
 
-def init_leaderboard_ts_df(
-    leaderboard_df: pd.DataFrame,
-    columns: list,
-    llm_query: list,
-    llm_provider_query: list,
-):
-
-    return update_ts_table(
-        leaderboard_df,
-        columns,
-        llm_query,
-        llm_provider_query,
-    )
-
-
 def filter_accuracy_method_func(df: pd.DataFrame, accuracy_method_query: str) -> pd.DataFrame:
     return df[df["Accuracy Method"] == accuracy_method_query]
 
@@ -139,6 +111,10 @@ def filter_llm_provider_func(df: pd.DataFrame, llm_provider_query: list) -> pd.D
     return df[df["LLM Provider"].isin(llm_provider_query)]
 
 
+def filter_metric_area_func(df: pd.DataFrame, metric_area_query: list) -> pd.DataFrame:
+    return df[df["Metric Area"].isin(metric_area_query)]
+
+
 def select_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
     always_here_cols = [
         AutoEvalColumn.model.name,
@@ -148,14 +124,6 @@ def select_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
     return filtered_df
 
 
-def select_columns_ts_table(df: pd.DataFrame, columns: list) -> pd.DataFrame:
-    always_here_cols = [
-        TSEvalColumn.model.name,
-    ]
-    filtered_df = df[always_here_cols + [c for c in TS_COLS if c in df.columns and c in columns]]
-    return filtered_df
-
-
 demo = gr.Blocks(css=custom_css)
 with demo:
     gr.HTML(TITLE)
@@ -164,34 +132,17 @@ with demo:
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
         with gr.TabItem("🏅 Leaderboard", elem_id="llm-benchmark-tab-table", id=0):
             with gr.Row():
-                with gr.Column():
-                    # with gr.Row():
-                    #     search_bar = gr.Textbox(
-                    #         placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
-                    #         show_label=False,
-                    #         elem_id="search-bar",
-                    #     )
-                    with gr.Row():
-                        shown_columns = gr.CheckboxGroup(
-                            choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden],
-                            value=[
-                                c.name
-                                for c in fields(AutoEvalColumn)
-                                if c.displayed_by_default and not c.hidden and not c.never_hidden
-                            ],
-                            label="Select columns to show",
-                            elem_id="column-select",
-                            interactive=True,
-                        )
-                # with gr.Column(min_width=320):
-                #     # with gr.Box(elem_id="box-filter"):
-                #     filter_columns_type = gr.CheckboxGroup(
-                #         label="Model types",
-                #         choices=[t.to_str() for t in ModelType],
-                #         value=[t.to_str() for t in ModelType],
-                #         interactive=True,
-                #         elem_id="filter-columns-type",
-                #     )
+                shown_columns = gr.CheckboxGroup(
+                    choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden],
+                    value=[
+                        c.name
+                        for c in fields(AutoEvalColumn)
+                        if c.displayed_by_default and not c.hidden and not c.never_hidden
+                    ],
+                    label="Select columns to show",
+                    elem_id="column-select",
+                    interactive=True,
+                )
             with gr.Row():
                 with gr.Column():
                     filter_llm = gr.CheckboxGroup(
@@ -202,13 +153,22 @@ with demo:
                         interactive=True,
                     )
                 with gr.Column():
-                    filter_llm_provider = gr.CheckboxGroup(
-                        choices=list(original_df["LLM Provider"].unique()),
-                        value=list(original_df["LLM Provider"].unique()),
-                        label="LLM Provider",
-                        info="",
-                        interactive=True,
-                    )
+                    with gr.Row():
+                        filter_llm_provider = gr.CheckboxGroup(
+                            choices=list(original_df["LLM Provider"].unique()),
+                            value=list(original_df["LLM Provider"].unique()),
+                            label="LLM Provider",
+                            info="",
+                            interactive=True,
+                        )
+                    with gr.Row():
+                        filter_metric_area = gr.CheckboxGroup(
+                            choices=["Accuracy", "Speed (Latency)", "Trust & Safety", "Cost"],
+                            value=["Accuracy", "Speed (Latency)", "Trust & Safety", "Cost"],
+                            label="Metric Area",
+                            info="",
+                            interactive=True,
+                        )
             with gr.Row():
                 filter_use_case = gr.CheckboxGroup(
                     choices=list(original_df["Use Case Name"].unique()),
@@ -244,14 +204,6 @@ with demo:
                 #     multiselect=True,
                 #     interactive=True,
                 # )
-                # with gr.Column():
-                #     filter_metric_area = gr.CheckboxGroup(
-                #         choices=["Accuracy", "Speed (Latency)", "Trust & Safety", "Cost"],
-                #         value=["Accuracy", "Speed (Latency)", "Trust & Safety", "Cost"],
-                #         label="Metric Area",
-                #         info="",
-                #         interactive=True,
-                #     )
                 with gr.Column():
                     filter_accuracy_method = gr.Radio(
                         choices=["Manual", "Auto"],
@@ -267,22 +219,6 @@ with demo:
                         info="Range: 0.0 to 4.0",
                         interactive=True,
                     )
-                # with gr.Column():
-                #     filter_llm = gr.CheckboxGroup(
-                #         choices=list(original_df["Model Name"].unique()),
-                #         value=list(leaderboard_df["Model Name"].unique()),
-                #         label="Model Name",
-                #         info="",
-                #         interactive=True,
-                #     )
-                # with gr.Column():
-                #     filter_llm_provider = gr.CheckboxGroup(
-                #         choices=list(original_df["LLM Provider"].unique()),
-                #         value=list(leaderboard_df["LLM Provider"].unique()),
-                #         label="LLM Provider",
-                #         info="",
-                #         interactive=True,
-                #     )
 
             leaderboard_table = gr.components.Dataframe(
                 # value=leaderboard_df[[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value],
@@ -311,19 +247,6 @@ with demo:
                 datatype=TYPES,
                 visible=False,
             )
-            # search_bar.submit(
-            #     update_table,
-            #     [
-            #         hidden_leaderboard_table_for_search,
-            #         shown_columns,
-            #         filter_columns_type,
-            #         filter_columns_precision,
-            #         filter_columns_size,
-            #         deleted_models_visibility,
-            #         search_bar,
-            #     ],
-            #     leaderboard_table,
-            # )
             for selector in [
                 shown_columns,
                 filter_llm,
@@ -333,10 +256,6 @@ with demo:
                 filter_use_case_area,
                 filter_use_case,
                 filter_use_case_type,
-                # filter_columns_type,
-                # filter_columns_precision,
-                # filter_columns_size,
-                # deleted_models_visibility,
             ]:
                 selector.change(
                     update_table,
@@ -350,81 +269,6 @@ with demo:
                     filter_use_case_area,
                     filter_use_case,
                     filter_use_case_type,
-                    # filter_columns_type,
-                    # filter_columns_precision,
-                    # filter_columns_size,
-                    # deleted_models_visibility,
-                    # search_bar,
-                ],
-                leaderboard_table,
-                queue=True,
-            )
-        with gr.TabItem("🏅 Trust & Safety", elem_id="llm-benchmark-tab-table", id=2):
-            with gr.Row():
-                with gr.Column():
-                    with gr.Row():
-                        shown_columns = gr.CheckboxGroup(
-                            choices=[c.name for c in fields(TSEvalColumn) if not c.hidden and not c.never_hidden],
-                            value=[
-                                c.name
-                                for c in fields(TSEvalColumn)
-                                if c.displayed_by_default and not c.hidden and not c.never_hidden
-                            ],
-                            label="Select columns to show",
-                            elem_id="column-select",
-                            interactive=True,
-                        )
-            with gr.Row():
-                with gr.Column():
-                    filter_llm = gr.CheckboxGroup(
-                        choices=list(ts_df["Model Name"].unique()),
-                        value=list(ts_df["Model Name"].unique()),
-                        label="Model Name",
-                        info="",
-                        interactive=True,
-                    )
-                with gr.Column():
-                    filter_llm_provider = gr.CheckboxGroup(
-                        choices=list(ts_df["LLM Provider"].unique()),
-                        value=list(ts_df["LLM Provider"].unique()),
-                        label="LLM Provider",
-                        info="",
-                        interactive=True,
-                    )
-
-            leaderboard_table = gr.components.Dataframe(
-                value=init_leaderboard_ts_df(
-                    leaderboard_ts_df,
-                    shown_columns.value,
-                    filter_llm.value,
-                    filter_llm_provider.value,
-                ),
-                headers=[c.name for c in fields(TSEvalColumn) if c.never_hidden] + shown_columns.value,
-                datatype=TS_TYPES,
-                elem_id="leaderboard-table",
-                interactive=False,
-                visible=True,
-            )
-
-            hidden_leaderboard_table_for_search = gr.components.Dataframe(
-                value=ts_df[TS_COLS],
-                headers=TS_COLS,
-                datatype=TS_TYPES,
-                visible=False,
-            )
-
-            for selector in [
-                shown_columns,
-                filter_llm,
-                filter_llm_provider,
-            ]:
-                selector.change(
-                    update_ts_table,
-                    [
-                        hidden_leaderboard_table_for_search,
-                        shown_columns,
-                        filter_llm,
-                        filter_llm_provider,
                 ],
                 leaderboard_table,
                 queue=True,
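
The newly added `filter_metric_area_func` follows the same pattern as the existing per-column helpers: each one narrows the hidden results dataframe by a single column, and the selector callbacks chain them before the column-selection step. A minimal, self-contained sketch of that chaining, with invented rows standing in for the real results loaded from CRM_RESULTS_PATH:

```python
import pandas as pd

# Invented sample rows for illustration only; the real frame comes from get_leaderboard_df_crm.
df = pd.DataFrame(
    {
        "Model Name": ["model-a", "model-b", "model-a"],
        "LLM Provider": ["provider-x", "provider-y", "provider-x"],
        "Metric Area": ["Accuracy", "Cost", "Trust & Safety"],
    }
)


def filter_llm_provider_func(df: pd.DataFrame, llm_provider_query: list) -> pd.DataFrame:
    # Existing helper: keep rows whose provider is checked in the "LLM Provider" group.
    return df[df["LLM Provider"].isin(llm_provider_query)]


def filter_metric_area_func(df: pd.DataFrame, metric_area_query: list) -> pd.DataFrame:
    # Helper added in this commit: keep rows whose metric area is checked in the new "Metric Area" group.
    return df[df["Metric Area"].isin(metric_area_query)]


filtered = filter_llm_provider_func(df, ["provider-x"])
filtered = filter_metric_area_func(filtered, ["Accuracy", "Trust & Safety"])
print(filtered)  # the provider-y "Cost" row is dropped; both provider-x rows remain
```

Trust & Safety thus survives as a value of the Metric Area filter on the main leaderboard rather than as its own tab.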
src/display/utils.py CHANGED
@@ -26,26 +26,26 @@ auto_eval_column_dict.append(
     ["model", ColumnContent, ColumnContent("Model Name", "markdown", True, never_hidden=True)]
 )
 auto_eval_column_dict.append(["model_provider", ColumnContent, ColumnContent("LLM Provider", "markdown", True)])
-auto_eval_column_dict.append(["use_case_name", ColumnContent, ColumnContent("Use Case Name", "markdown", False)])
+auto_eval_column_dict.append(["use_case_name", ColumnContent, ColumnContent("Use Case Name", "markdown", True)])
 auto_eval_column_dict.append(["use_case_type", ColumnContent, ColumnContent("Use Case Type", "markdown", False)])
 auto_eval_column_dict.append(["accuracy_method", ColumnContent, ColumnContent("Accuracy Method", "markdown", False)])
 # Accuracy metrics
-auto_eval_column_dict.append(["accuracy_metric_average", ColumnContent, ColumnContent("Accuracy", "markdown", False)])
+auto_eval_column_dict.append(["accuracy_metric_average", ColumnContent, ColumnContent("Accuracy", "markdown", True)])
 auto_eval_column_dict.append(
     [
         "accuracy_metric_instruction_following",
         ColumnContent,
-        ColumnContent("Instruction Following", "markdown", False),
+        ColumnContent("Instruction Following", "markdown", True),
     ]
 )
 auto_eval_column_dict.append(
-    ["accuracy_metric_completeness", ColumnContent, ColumnContent("Completeness", "markdown", False)]
+    ["accuracy_metric_completeness", ColumnContent, ColumnContent("Completeness", "markdown", True)]
 )
 auto_eval_column_dict.append(
-    ["accuracy_metric_conciseness", ColumnContent, ColumnContent("Conciseness", "markdown", False)]
+    ["accuracy_metric_conciseness", ColumnContent, ColumnContent("Conciseness", "markdown", True)]
 )
 auto_eval_column_dict.append(
-    ["accuracy_metric_factuality", ColumnContent, ColumnContent("Factuality", "markdown", False)]
+    ["accuracy_metric_factuality", ColumnContent, ColumnContent("Factuality", "markdown", True)]
 )
 # Speed (Latency) & Cost metrics
 auto_eval_column_dict.append(["latency", ColumnContent, ColumnContent("Response Time (Sec)", "markdown", True)])
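
The `False` → `True` flips above only change the `displayed_by_default` flag, which controls which columns start out checked in the "Select columns to show" CheckboxGroup built in app.py. A rough sketch of that mechanism, using a simplified stand-in for `ColumnContent` (the real class and its `fields` helper live in src/display/utils.py and carry more attributes):

```python
from dataclasses import dataclass


@dataclass
class ColumnContentSketch:
    # Simplified stand-in for ColumnContent(name, type, displayed_by_default, ..., never_hidden=...).
    name: str
    type: str = "markdown"
    displayed_by_default: bool = False
    hidden: bool = False
    never_hidden: bool = False


columns = [
    ColumnContentSketch("Model Name", never_hidden=True),             # always shown, not selectable
    ColumnContentSketch("Use Case Name", displayed_by_default=True),  # now pre-checked
    ColumnContentSketch("Accuracy", displayed_by_default=True),       # now pre-checked
    ColumnContentSketch("Use Case Type"),                             # still opt-in
]

# Mirrors the CheckboxGroup choices/value logic in app.py.
choices = [c.name for c in columns if not c.hidden and not c.never_hidden]
default_value = [c.name for c in columns if c.displayed_by_default and not c.hidden and not c.never_hidden]
print(choices)        # ['Use Case Name', 'Accuracy', 'Use Case Type']
print(default_value)  # ['Use Case Name', 'Accuracy']
```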
src/populate.py CHANGED
@@ -67,4 +67,4 @@ def get_leaderboard_df_crm(
         by=[AutoEvalColumn.accuracy_metric_average.name], ascending=False
     )
     leaderboard_accuracy_df = leaderboard_accuracy_df[accuracy_cols].round(decimals=2)
-    return leaderboard_accuracy_df, leaderboard_ts_df
+    return leaderboard_accuracy_df
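
With the Trust & Safety frame gone, `get_leaderboard_df_crm` returns a single accuracy leaderboard and the caller in app.py no longer unpacks a tuple. A small sketch of the sort-round-return shape visible in the hunk above, on invented data (the real function reads results from CRM_RESULTS_PATH and selects the configured `accuracy_cols`):

```python
import pandas as pd


def get_leaderboard_df_sketch(raw_df: pd.DataFrame, accuracy_cols: list) -> pd.DataFrame:
    # Same tail shape as get_leaderboard_df_crm: sort by average accuracy,
    # keep the accuracy columns, round, and return a single frame.
    sorted_df = raw_df.sort_values(by=["Accuracy"], ascending=False)
    return sorted_df[accuracy_cols].round(decimals=2)


raw = pd.DataFrame(
    {
        "Model Name": ["model-a", "model-b"],
        "Accuracy": [3.14159, 3.71828],
        "Completeness": [2.501, 3.004],
    }
)

original_df = get_leaderboard_df_sketch(raw, ["Model Name", "Accuracy", "Completeness"])
leaderboard_df = original_df.copy()  # single dataframe; no ts_df companion any more
print(leaderboard_df)
```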