yibum committed
Commit 2bc2f6b · 1 Parent(s): ada4cd8

add trust&safety table

app.py CHANGED
@@ -13,9 +13,12 @@ from src.display.utils import ( # EVAL_TYPES,; WeightType,; BENCHMARK_COLS,; EV
     COLS,
     COST_COLS,
     COST_TYPES,
+    TS_COLS,
+    TS_TYPES,
     TYPES,
     AutoEvalColumn,
     CostEvalColumn,
+    TSEvalColumn,
     fields,
 )

@@ -23,11 +26,12 @@ from src.display.utils import ( # EVAL_TYPES,; WeightType,; BENCHMARK_COLS,; EV
 from src.envs import CRM_RESULTS_PATH
 from src.populate import get_leaderboard_df_crm

-original_df, cost_df = get_leaderboard_df_crm(CRM_RESULTS_PATH, COLS, COST_COLS)
+original_df, cost_df, ts_df = get_leaderboard_df_crm(CRM_RESULTS_PATH, COLS, COST_COLS)

 # raw_data, original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
 leaderboard_df = original_df.copy()
 leaderboard_cost_df = cost_df.copy()
+leaderboard_ts_df = ts_df.copy()
 # leaderboard_df = leaderboard_df.style.format({"accuracy_metric_average": "{0:.2f}"})


@@ -70,6 +74,18 @@ def update_cost_table(
     return df.style.map(highlight_cost_band_low, props="background-color: #b3d5a4")


+def update_ts_table(
+    hidden_df: pd.DataFrame,
+    columns: list,
+    llm_query: list,
+    llm_provider_query: list,
+):
+    filtered_df = filter_llm_func(hidden_df, llm_query)
+    filtered_df = filter_llm_provider_func(filtered_df, llm_provider_query)
+    df = select_columns_ts_table(filtered_df, columns)
+    return df
+
+
 # def highlight_cols(x):
 #     df = x.copy()
 #     df.loc[:, :] = "color: black"
@@ -126,6 +142,21 @@ def init_leaderboard_cost_df(
     )


+def init_leaderboard_ts_df(
+    leaderboard_df: pd.DataFrame,
+    columns: list,
+    llm_query: list,
+    llm_provider_query: list,
+):
+
+    return update_ts_table(
+        leaderboard_df,
+        columns,
+        llm_query,
+        llm_provider_query,
+    )
+
+
 def filter_accuracy_method_func(df: pd.DataFrame, accuracy_method_query: str) -> pd.DataFrame:
     return df[df["Accuracy Method"] == accuracy_method_query]

@@ -177,6 +208,14 @@ def select_columns_cost_table(df: pd.DataFrame, columns: list) -> pd.DataFrame:
     return filtered_df


+def select_columns_ts_table(df: pd.DataFrame, columns: list) -> pd.DataFrame:
+    always_here_cols = [
+        TSEvalColumn.model.name,
+    ]
+    filtered_df = df[always_here_cols + [c for c in TS_COLS if c in df.columns and c in columns]]
+    return filtered_df
+
+
 demo = gr.Blocks(css=custom_css)
 with demo:
     gr.HTML(TITLE)
@@ -461,8 +500,77 @@ with demo:
                     leaderboard_table,
                     queue=True,
                 )
+        with gr.TabItem("🏅 Trust & Safety", elem_id="llm-benchmark-tab-table", id=2):
+            with gr.Row():
+                with gr.Column():
+                    with gr.Row():
+                        shown_columns = gr.CheckboxGroup(
+                            choices=[c.name for c in fields(TSEvalColumn) if not c.hidden and not c.never_hidden],
+                            value=[
+                                c.name
+                                for c in fields(TSEvalColumn)
+                                if c.displayed_by_default and not c.hidden and not c.never_hidden
+                            ],
+                            label="Select columns to show",
+                            elem_id="column-select",
+                            interactive=True,
+                        )
+                    with gr.Row():
+                        with gr.Column():
+                            filter_llm = gr.CheckboxGroup(
+                                choices=list(ts_df["Model Name"].unique()),
+                                value=list(ts_df["Model Name"].unique()),
+                                label="Model Name",
+                                info="",
+                                interactive=True,
+                            )
+                        with gr.Column():
+                            filter_llm_provider = gr.CheckboxGroup(
+                                choices=list(ts_df["LLM Provider"].unique()),
+                                value=list(ts_df["LLM Provider"].unique()),
+                                label="LLM Provider",
+                                info="",
+                                interactive=True,
+                            )

+            leaderboard_table = gr.components.Dataframe(
+                value=init_leaderboard_ts_df(
+                    leaderboard_ts_df,
+                    shown_columns.value,
+                    filter_llm.value,
+                    filter_llm_provider.value,
+                ),
+                headers=[c.name for c in fields(TSEvalColumn) if c.never_hidden] + shown_columns.value,
+                datatype=TS_TYPES,
+                elem_id="leaderboard-table",
+                interactive=False,
+                visible=True,
+            )
+
+            hidden_leaderboard_table_for_search = gr.components.Dataframe(
+                value=ts_df[TS_COLS],
+                headers=TS_COLS,
+                datatype=TS_TYPES,
+                visible=False,
+            )
+
+            for selector in [
+                shown_columns,
+                filter_llm,
+                filter_llm_provider,
+            ]:
+                selector.change(
+                    update_ts_table,
+                    [
+                        hidden_leaderboard_table_for_search,
+                        shown_columns,
+                        filter_llm,
+                        filter_llm_provider,
+                    ],
+                    leaderboard_table,
+                    queue=True,
+                )
-        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
+        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=3):
             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

             with gr.Row():
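For context, a minimal sketch (not part of the commit) of how the new select_columns_ts_table helper behaves: "Model Name" always stays because it is the never_hidden TSEvalColumn field, and any other column survives only if it is both in TS_COLS and in the user's checkbox selection. The frame and provider labels below are toy values.

import pandas as pd

# Mirrors TS_COLS as defined in src/display/utils.py further down in this commit.
TS_COLS = ["Model Name", "LLM Provider", "Truthfulness", "Safety"]


def select_columns_ts_table(df: pd.DataFrame, columns: list) -> pd.DataFrame:
    always_here_cols = ["Model Name"]  # TSEvalColumn.model.name resolves to this label
    return df[always_here_cols + [c for c in TS_COLS if c in df.columns and c in columns]]


toy = pd.DataFrame(
    {
        "Model Name": ["GPT4-o", "Mistral 7B"],
        "LLM Provider": ["OpenAI", "Mistral AI"],  # illustrative provider labels
        "Truthfulness": ["91%", "32%"],
        "Safety": ["69%", "42%"],
    }
)

# Unticking everything except "Safety" leaves the model column plus Safety.
print(select_columns_ts_table(toy, ["Safety"]).columns.tolist())
# ['Model Name', 'Safety']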
crm-results/hf_leaderboard_ts.csv ADDED
@@ -0,0 +1,19 @@
+Model Name,Truthfulness,Safety,Privacy Zero-Shot Match Avoidance,Privacy Zero-Shot Reveal Avoidance,Privacy Five-Shot Match Avoidance,Privacy Five-Shot Reveal Avoidance,CRM Gender Bias,CRM Company Bias,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Truthfulness,,
+GPT4-o,91%,69%,100%,94%,90%,51%,-,-,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,91%,,
+GPT 4 Turbo,94%,74%,100%,97%,86%,74%,-,-,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,79%,0.813,
+GPT 3.5 Turbo,45%,59%,100%,13%,36%,2%,-,-,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,45%,,0.708 (ChatGPT)
+AI21 Jamba-Instruct,68%,65%,100%,100%,90%,81%,-,-,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,68%,,
+Cohere Command Text,59%,54%,100%,84%,78%,40%,-,-,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,59%,,
+Claude 3 Haiku,86%,80%,100%,98%,95%,40%,-,-,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,86%,,
+Gemini Pro 1,87%,74%,100%,92%,81%,48%,-,-,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,87%,,
+SF-TextBase 70B,98%,63%,100%,90%,54%,8%,-,-,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,98%,,
+SF-TextSum,82%,51%,100%,89%,87%,27%,-,-,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,82%,,
+XGen 22B,52%,52%,100%,56%,81%,51%,-,-,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,52%,,
+SF-TextBase 7B,82%,60%,100%,83%,69%,27%,-,-,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,82%,,
+Mistral 7B,32%,42%,100%,97%,92%,82%,-,-,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,32%,0.426,
+Mixtral 8x7B,89%,59%,100%,97%,71%,55%,-,-,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,89%,0.88,
+LLaMA 3 8B,96%,76%,100%,99%,92%,85%,-,-,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,96%,0.598,
+LLaMA 3 70B,98%,74%,100%,98%,83%,75%,-,-,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,98%,0.962,
+Gemini Pro 1.5,98%,81%,100%,97%,87%,69%,-,-,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,98%,,
+Claude 3 Opus,94%,81%,100%,96%,80%,56%,-,-,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,94%,,
+Cohere Command R+,84%,56%,100%,97%,76%,45%,-,-,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,84%,,
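The exported CSV carries a long tail of unnamed, mostly empty spreadsheet columns after "CRM Company Bias". As a minimal sketch (not part of the commit), the file can be trimmed to its named leaderboard columns at load time; the path is assumed to be relative to the repository root.

import pandas as pd

NAMED_COLS = [
    "Model Name", "Truthfulness", "Safety",
    "Privacy Zero-Shot Match Avoidance", "Privacy Zero-Shot Reveal Avoidance",
    "Privacy Five-Shot Match Avoidance", "Privacy Five-Shot Reveal Avoidance",
    "CRM Gender Bias", "CRM Company Bias",
]

ts_df = pd.read_csv("crm-results/hf_leaderboard_ts.csv")
# Selecting by name drops the empty "Unnamed: N" columns and the duplicated
# trailing "Truthfulness" header (pandas reads the duplicate as "Truthfulness.1").
ts_df = ts_df[NAMED_COLS]
print(ts_df.shape)  # (18, 9): 18 models, 9 named columns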
src/display/utils.py CHANGED
@@ -73,6 +73,13 @@ cost_eval_column_dict.append(["cost_band", ColumnContent, ColumnContent("Cost Ba
 CostEvalColumn = make_dataclass("CostEvalColumn", cost_eval_column_dict, frozen=True)

 # Trust & Safety metrics
+ts_eval_column_dict = []
+# Init
+ts_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model Name", "markdown", True, never_hidden=True)])
+ts_eval_column_dict.append(["model_provider", ColumnContent, ColumnContent("LLM Provider", "markdown", True)])
+ts_eval_column_dict.append(["truthfulness", ColumnContent, ColumnContent("Truthfulness", "markdown", True)])
+ts_eval_column_dict.append(["safety", ColumnContent, ColumnContent("Safety", "markdown", True)])
+TSEvalColumn = make_dataclass("TSEvalColumn", ts_eval_column_dict, frozen=True)


 # Scores
@@ -173,6 +180,9 @@ TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default a
 COST_COLS = [c.name for c in fields(CostEvalColumn) if not c.hidden]
 COST_TYPES = [c.type for c in fields(CostEvalColumn) if not c.hidden]

+TS_COLS = [c.name for c in fields(TSEvalColumn) if not c.hidden]
+TS_TYPES = [c.type for c in fields(TSEvalColumn) if not c.hidden]
+
 EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
 EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]

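For reference, a minimal sketch (not part of the commit) of how the TSEvalColumn pattern produces TS_COLS and TS_TYPES. ColumnContent and the fields() helper are reproduced here roughly as the stock leaderboard template defines them; only the TSEvalColumn lines themselves appear in this diff, so treat the rest as an assumption.

from dataclasses import dataclass, make_dataclass


@dataclass(frozen=True)
class ColumnContent:  # assumed shape of the template's column-metadata record
    name: str
    type: str
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False


def fields(raw_class):
    # The template's helper: returns the ColumnContent defaults stored on the
    # class (not dataclasses.Field objects), so c.name / c.type / c.hidden work.
    return [v for k, v in raw_class.__dict__.items() if not k.startswith("__")]


ts_eval_column_dict = [
    ["model", ColumnContent, ColumnContent("Model Name", "markdown", True, never_hidden=True)],
    ["model_provider", ColumnContent, ColumnContent("LLM Provider", "markdown", True)],
    ["truthfulness", ColumnContent, ColumnContent("Truthfulness", "markdown", True)],
    ["safety", ColumnContent, ColumnContent("Safety", "markdown", True)],
]
TSEvalColumn = make_dataclass("TSEvalColumn", ts_eval_column_dict, frozen=True)

TS_COLS = [c.name for c in fields(TSEvalColumn) if not c.hidden]
TS_TYPES = [c.type for c in fields(TSEvalColumn) if not c.hidden]
print(TS_COLS)   # ['Model Name', 'LLM Provider', 'Truthfulness', 'Safety']
print(TS_TYPES)  # ['markdown', 'markdown', 'markdown', 'markdown']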
src/populate.py CHANGED
@@ -30,7 +30,12 @@ def get_leaderboard_df_crm(
     leaderboard_cost_df = leaderboard_cost_df.join(ref_df.set_index("Model Name"), on="Model Name")
     leaderboard_cost_df["LLM Provider"] = leaderboard_cost_df["LLM Provider"].fillna("Google")
     leaderboard_cost_df = leaderboard_cost_df[cost_cols].round(decimals=2)
-    return leaderboard_accuracy_df, leaderboard_cost_df
+
+    leaderboard_ts_df = pd.read_csv(os.path.join(crm_results_path, "hf_leaderboard_ts.csv"))
+    leaderboard_ts_df = leaderboard_ts_df[~leaderboard_ts_df["Model Name"].isin(sf_finetuned_models)]
+    leaderboard_ts_df = leaderboard_ts_df.join(ref_df.set_index("Model Name"), on="Model Name")
+
+    return leaderboard_accuracy_df, leaderboard_cost_df, leaderboard_ts_df


 # def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
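A small illustrative sketch (not part of the commit) of why the ref_df join matters: the Trust & Safety CSV has no "LLM Provider" column, so joining against the reference frame on "Model Name" is what supplies the provider values that the new tab's provider filter reads in app.py. Provider names below are toy values.

import pandas as pd

ts_df = pd.DataFrame({"Model Name": ["GPT4-o", "Mistral 7B"], "Truthfulness": ["91%", "32%"]})
ref_df = pd.DataFrame({"Model Name": ["GPT4-o", "Mistral 7B"], "LLM Provider": ["OpenAI", "Mistral AI"]})

# Same pattern as get_leaderboard_df_crm: align on "Model Name" and pull in provider info.
ts_df = ts_df.join(ref_df.set_index("Model Name"), on="Model Name")
print(ts_df.columns.tolist())  # ['Model Name', 'Truthfulness', 'LLM Provider']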