add trust&safety table
- app.py +110 -2
- crm-results/hf_leaderboard_ts.csv +19 -0
- src/display/utils.py +10 -0
- src/populate.py +6 -1
app.py
CHANGED
@@ -13,9 +13,12 @@ from src.display.utils import ( # EVAL_TYPES,; WeightType,; BENCHMARK_COLS,; EV
     COLS,
     COST_COLS,
     COST_TYPES,
+    TS_COLS,
+    TS_TYPES,
     TYPES,
     AutoEvalColumn,
     CostEvalColumn,
+    TSEvalColumn,
     fields,
 )
 
@@ -23,11 +26,12 @@ from src.display.utils import ( # EVAL_TYPES,; WeightType,; BENCHMARK_COLS,; EV
 from src.envs import CRM_RESULTS_PATH
 from src.populate import get_leaderboard_df_crm
 
-original_df, cost_df = get_leaderboard_df_crm(CRM_RESULTS_PATH, COLS, COST_COLS)
+original_df, cost_df, ts_df = get_leaderboard_df_crm(CRM_RESULTS_PATH, COLS, COST_COLS)
 
 # raw_data, original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
 leaderboard_df = original_df.copy()
 leaderboard_cost_df = cost_df.copy()
+leaderboard_ts_df = ts_df.copy()
 # leaderboard_df = leaderboard_df.style.format({"accuracy_metric_average": "{0:.2f}"})
 
 
@@ -70,6 +74,18 @@ def update_cost_table(
     return df.style.map(highlight_cost_band_low, props="background-color: #b3d5a4")
 
 
+def update_ts_table(
+    hidden_df: pd.DataFrame,
+    columns: list,
+    llm_query: list,
+    llm_provider_query: list,
+):
+    filtered_df = filter_llm_func(hidden_df, llm_query)
+    filtered_df = filter_llm_provider_func(filtered_df, llm_provider_query)
+    df = select_columns_ts_table(filtered_df, columns)
+    return df
+
+
 # def highlight_cols(x):
 #     df = x.copy()
 #     df.loc[:, :] = "color: black"
@@ -126,6 +142,21 @@ def init_leaderboard_cost_df(
     )
 
 
+def init_leaderboard_ts_df(
+    leaderboard_df: pd.DataFrame,
+    columns: list,
+    llm_query: list,
+    llm_provider_query: list,
+):
+
+    return update_ts_table(
+        leaderboard_df,
+        columns,
+        llm_query,
+        llm_provider_query,
+    )
+
+
 def filter_accuracy_method_func(df: pd.DataFrame, accuracy_method_query: str) -> pd.DataFrame:
     return df[df["Accuracy Method"] == accuracy_method_query]
 
@@ -177,6 +208,14 @@ def select_columns_cost_table(df: pd.DataFrame, columns: list) -> pd.DataFrame:
     return filtered_df
 
 
+def select_columns_ts_table(df: pd.DataFrame, columns: list) -> pd.DataFrame:
+    always_here_cols = [
+        TSEvalColumn.model.name,
+    ]
+    filtered_df = df[always_here_cols + [c for c in TS_COLS if c in df.columns and c in columns]]
+    return filtered_df
+
+
 demo = gr.Blocks(css=custom_css)
 with demo:
     gr.HTML(TITLE)
@@ -461,8 +500,77 @@ with demo:
                     leaderboard_table,
                     queue=True,
                 )
+        with gr.TabItem("🏅 Trust & Safety", elem_id="llm-benchmark-tab-table", id=2):
+            with gr.Row():
+                with gr.Column():
+                    with gr.Row():
+                        shown_columns = gr.CheckboxGroup(
+                            choices=[c.name for c in fields(TSEvalColumn) if not c.hidden and not c.never_hidden],
+                            value=[
+                                c.name
+                                for c in fields(TSEvalColumn)
+                                if c.displayed_by_default and not c.hidden and not c.never_hidden
+                            ],
+                            label="Select columns to show",
+                            elem_id="column-select",
+                            interactive=True,
+                        )
+                    with gr.Row():
+                        with gr.Column():
+                            filter_llm = gr.CheckboxGroup(
+                                choices=list(ts_df["Model Name"].unique()),
+                                value=list(ts_df["Model Name"].unique()),
+                                label="Model Name",
+                                info="",
+                                interactive=True,
+                            )
+                        with gr.Column():
+                            filter_llm_provider = gr.CheckboxGroup(
+                                choices=list(ts_df["LLM Provider"].unique()),
+                                value=list(ts_df["LLM Provider"].unique()),
+                                label="LLM Provider",
+                                info="",
+                                interactive=True,
+                            )
 
-        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
+            leaderboard_table = gr.components.Dataframe(
+                value=init_leaderboard_ts_df(
+                    leaderboard_ts_df,
+                    shown_columns.value,
+                    filter_llm.value,
+                    filter_llm_provider.value,
+                ),
+                headers=[c.name for c in fields(TSEvalColumn) if c.never_hidden] + shown_columns.value,
+                datatype=TS_TYPES,
+                elem_id="leaderboard-table",
+                interactive=False,
+                visible=True,
+            )
+
+            hidden_leaderboard_table_for_search = gr.components.Dataframe(
+                value=ts_df[TS_COLS],
+                headers=TS_COLS,
+                datatype=TS_TYPES,
+                visible=False,
+            )
+
+            for selector in [
+                shown_columns,
+                filter_llm,
+                filter_llm_provider,
+            ]:
+                selector.change(
+                    update_ts_table,
+                    [
+                        hidden_leaderboard_table_for_search,
+                        shown_columns,
+                        filter_llm,
+                        filter_llm_provider,
+                    ],
+                    leaderboard_table,
+                    queue=True,
+                )
+        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=3):
             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
 
     with gr.Row():
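The new `update_ts_table` helper leans on `filter_llm_func` and `filter_llm_provider_func`, which already exist in app.py and are not touched by this diff. A minimal sketch of the membership filtering they are assumed to perform over the checkbox selections:

```python
import pandas as pd

# Hedged sketch: these two helpers are called by update_ts_table above but are
# defined elsewhere in app.py; they presumably keep only the rows whose model
# or provider is ticked in the corresponding CheckboxGroup.
def filter_llm_func(df: pd.DataFrame, llm_query: list) -> pd.DataFrame:
    return df[df["Model Name"].isin(llm_query)]


def filter_llm_provider_func(df: pd.DataFrame, llm_provider_query: list) -> pd.DataFrame:
    return df[df["LLM Provider"].isin(llm_provider_query)]
```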
crm-results/hf_leaderboard_ts.csv
ADDED
@@ -0,0 +1,19 @@
Model Name,Truthfulness,Safety,Privacy Zero-Shot Match Avoidance,Privacy Zero-Shot Reveal Avoidance,Privacy Five-Shot Match Avoidance,Privacy Five-Shot Reveal Avoidance,CRM Gender Bias,CRM Company Bias,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Truthfulness,,
GPT4-o,91%,69%,100%,94%,90%,51%,-,-,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,91%,,
GPT 4 Turbo,94%,74%,100%,97%,86%,74%,-,-,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,79%,0.813,
GPT 3.5 Turbo,45%,59%,100%,13%,36%,2%,-,-,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,45%,,0.708 (ChatGPT)
AI21 Jamba-Instruct,68%,65%,100%,100%,90%,81%,-,-,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,68%,,
Cohere Command Text,59%,54%,100%,84%,78%,40%,-,-,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,59%,,
Claude 3 Haiku,86%,80%,100%,98%,95%,40%,-,-,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,86%,,
Gemini Pro 1,87%,74%,100%,92%,81%,48%,-,-,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,87%,,
SF-TextBase 70B,98%,63%,100%,90%,54%,8%,-,-,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,98%,,
SF-TextSum,82%,51%,100%,89%,87%,27%,-,-,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,82%,,
XGen 22B,52%,52%,100%,56%,81%,51%,-,-,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,52%,,
SF-TextBase 7B,82%,60%,100%,83%,69%,27%,-,-,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,82%,,
Mistral 7B,32%,42%,100%,97%,92%,82%,-,-,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,32%,0.426,
Mixtral 8x7B,89%,59%,100%,97%,71%,55%,-,-,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,89%,0.88,
LLaMA 3 8B,96%,76%,100%,99%,92%,85%,-,-,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,96%,0.598,
LLaMA 3 70B,98%,74%,100%,98%,83%,75%,-,-,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,98%,0.962,
Gemini Pro 1.5,98%,81%,100%,97%,87%,69%,-,-,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,98%,,
Claude 3 Opus,94%,81%,100%,96%,80%,56%,-,-,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,94%,,
Cohere Command R+,84%,56%,100%,97%,76%,45%,-,-,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,84%,,
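The new CSV is a spreadsheet-style export: besides the named metrics it carries long runs of blank columns and a second Truthfulness column near the end. A small sketch (not part of this commit) of loading it and keeping only the named columns, assuming pandas' default handling of blank and duplicate headers and that it is run from the repo root:

```python
import pandas as pd

# Sketch only: blank headers come in as "Unnamed: N" and the duplicated
# Truthfulness header as "Truthfulness.1"; drop both before display.
ts_df = pd.read_csv("crm-results/hf_leaderboard_ts.csv")
ts_df = ts_df.drop(columns=[c for c in ts_df.columns if c.startswith("Unnamed") or c.endswith(".1")])
print(ts_df.columns.tolist())
```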
src/display/utils.py
CHANGED
@@ -73,6 +73,13 @@ cost_eval_column_dict.append(["cost_band", ColumnContent, ColumnContent("Cost Ba
 CostEvalColumn = make_dataclass("CostEvalColumn", cost_eval_column_dict, frozen=True)
 
 # Trust & Safety metrics
+ts_eval_column_dict = []
+# Init
+ts_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model Name", "markdown", True, never_hidden=True)])
+ts_eval_column_dict.append(["model_provider", ColumnContent, ColumnContent("LLM Provider", "markdown", True)])
+ts_eval_column_dict.append(["truthfulness", ColumnContent, ColumnContent("Truthfulness", "markdown", True)])
+ts_eval_column_dict.append(["safety", ColumnContent, ColumnContent("Safety", "markdown", True)])
+TSEvalColumn = make_dataclass("TSEvalColumn", ts_eval_column_dict, frozen=True)
 
 
 # Scores
@@ -173,6 +180,9 @@ TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default a
 COST_COLS = [c.name for c in fields(CostEvalColumn) if not c.hidden]
 COST_TYPES = [c.type for c in fields(CostEvalColumn) if not c.hidden]
 
+TS_COLS = [c.name for c in fields(TSEvalColumn) if not c.hidden]
+TS_TYPES = [c.type for c in fields(TSEvalColumn) if not c.hidden]
+
 EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
 EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
 
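The new `TSEvalColumn` definitions follow the existing Accuracy/Cost column pattern; `ColumnContent` and `fields` are not part of this diff. A self-contained sketch, assuming those helpers match the standard leaderboard template, of what the new `TS_COLS` and `TS_TYPES` resolve to:

```python
from dataclasses import dataclass, make_dataclass

# Assumed shapes of the untouched helpers: ColumnContent describes one
# user-facing column; fields() returns the ColumnContent defaults of a
# generated *EvalColumn class.
@dataclass(frozen=True)
class ColumnContent:
    name: str
    type: str
    displayed_by_default: bool = True
    hidden: bool = False
    never_hidden: bool = False


def fields(raw_class):
    return [v for k, v in raw_class.__dict__.items() if not k.startswith("__")]


# Rebuilding TSEvalColumn exactly as in the diff above...
ts_eval_column_dict = [
    ["model", ColumnContent, ColumnContent("Model Name", "markdown", True, never_hidden=True)],
    ["model_provider", ColumnContent, ColumnContent("LLM Provider", "markdown", True)],
    ["truthfulness", ColumnContent, ColumnContent("Truthfulness", "markdown", True)],
    ["safety", ColumnContent, ColumnContent("Safety", "markdown", True)],
]
TSEvalColumn = make_dataclass("TSEvalColumn", ts_eval_column_dict, frozen=True)

# ...TS_COLS / TS_TYPES then give the headers and Gradio datatypes used by the
# new Trust & Safety tab:
print([c.name for c in fields(TSEvalColumn) if not c.hidden])
# ['Model Name', 'LLM Provider', 'Truthfulness', 'Safety']
print([c.type for c in fields(TSEvalColumn) if not c.hidden])
# ['markdown', 'markdown', 'markdown', 'markdown']
```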
src/populate.py
CHANGED
@@ -30,7 +30,12 @@ def get_leaderboard_df_crm(
     leaderboard_cost_df = leaderboard_cost_df.join(ref_df.set_index("Model Name"), on="Model Name")
     leaderboard_cost_df["LLM Provider"] = leaderboard_cost_df["LLM Provider"].fillna("Google")
     leaderboard_cost_df = leaderboard_cost_df[cost_cols].round(decimals=2)
-    return leaderboard_accuracy_df, leaderboard_cost_df
+
+    leaderboard_ts_df = pd.read_csv(os.path.join(crm_results_path, "hf_leaderboard_ts.csv"))
+    leaderboard_ts_df = leaderboard_ts_df[~leaderboard_ts_df["Model Name"].isin(sf_finetuned_models)]
+    leaderboard_ts_df = leaderboard_ts_df.join(ref_df.set_index("Model Name"), on="Model Name")
+
+    return leaderboard_accuracy_df, leaderboard_cost_df, leaderboard_ts_df
 
 
 # def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
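`get_leaderboard_df_crm` now returns three dataframes, so any remaining two-value caller must be updated along with app.py. A usage sketch, assuming it is run inside the Space repo so the `src` modules are importable:

```python
# Mirrors the updated call in app.py; CRM_RESULTS_PATH, COLS and COST_COLS
# come from the repo's own modules.
from src.display.utils import COLS, COST_COLS
from src.envs import CRM_RESULTS_PATH
from src.populate import get_leaderboard_df_crm

accuracy_df, cost_df, ts_df = get_leaderboard_df_crm(CRM_RESULTS_PATH, COLS, COST_COLS)
print(ts_df[["Model Name", "Truthfulness", "Safety"]].head())
```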