join Trust & Safety table
- src/display/utils.py +14 -10
- src/populate.py +11 -5
src/display/utils.py
CHANGED
@@ -26,35 +26,39 @@ auto_eval_column_dict.append(
     ["model", ColumnContent, ColumnContent("Model Name", "markdown", True, never_hidden=True)]
 )
 auto_eval_column_dict.append(["model_provider", ColumnContent, ColumnContent("LLM Provider", "markdown", True)])
-auto_eval_column_dict.append(["use_case_name", ColumnContent, ColumnContent("Use Case Name", "markdown", …
+auto_eval_column_dict.append(["use_case_name", ColumnContent, ColumnContent("Use Case Name", "markdown", False)])
 auto_eval_column_dict.append(["use_case_type", ColumnContent, ColumnContent("Use Case Type", "markdown", False)])
 auto_eval_column_dict.append(["accuracy_method", ColumnContent, ColumnContent("Accuracy Method", "markdown", False)])
 # Accuracy metrics
-auto_eval_column_dict.append(["accuracy_metric_average", ColumnContent, ColumnContent("Accuracy", "markdown", …
+auto_eval_column_dict.append(["accuracy_metric_average", ColumnContent, ColumnContent("Accuracy", "markdown", False)])
 auto_eval_column_dict.append(
     [
         "accuracy_metric_instruction_following",
         ColumnContent,
-        ColumnContent("Instruction Following", "markdown", …
+        ColumnContent("Instruction Following", "markdown", False),
     ]
 )
 auto_eval_column_dict.append(
-    ["accuracy_metric_completeness", ColumnContent, ColumnContent("Completeness", "markdown", …
+    ["accuracy_metric_completeness", ColumnContent, ColumnContent("Completeness", "markdown", False)]
 )
 auto_eval_column_dict.append(
-    ["accuracy_metric_conciseness", ColumnContent, ColumnContent("Conciseness", "markdown", …
+    ["accuracy_metric_conciseness", ColumnContent, ColumnContent("Conciseness", "markdown", False)]
 )
 auto_eval_column_dict.append(
-    ["accuracy_metric_factuality", ColumnContent, ColumnContent("Factuality", "markdown", …
+    ["accuracy_metric_factuality", ColumnContent, ColumnContent("Factuality", "markdown", False)]
 )
-#
-# ["use_case_flavor", ColumnContent, ColumnContent("Cost and Speed: Flavor", "markdown", False)]
-# )
+# Speed (Latency) & Cost metrics
 auto_eval_column_dict.append(["latency", ColumnContent, ColumnContent("Response Time (Sec)", "markdown", True)])
 auto_eval_column_dict.append(
     ["mean_output_tokens", ColumnContent, ColumnContent("Mean Output Tokens", "markdown", True)]
 )
 auto_eval_column_dict.append(["cost_band", ColumnContent, ColumnContent("Cost Band", "markdown", True)])
+# Trust & Safety metrics
+auto_eval_column_dict.append(["ts", ColumnContent, ColumnContent("Trust & Safety", "markdown", True)])
+auto_eval_column_dict.append(["safety", ColumnContent, ColumnContent("Safety", "markdown", False)])
+auto_eval_column_dict.append(["privacy", ColumnContent, ColumnContent("Privacy", "markdown", False)])
+auto_eval_column_dict.append(["truthfulness", ColumnContent, ColumnContent("Truthfulness", "markdown", False)])
+auto_eval_column_dict.append(["crm_bias", ColumnContent, ColumnContent("CRM Bias", "markdown", False)])
 # We use make dataclass to dynamically fill the scores from Tasks
 AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)

@@ -80,7 +84,7 @@ CostEvalColumn = make_dataclass("CostEvalColumn", cost_eval_column_dict, frozen=
 ts_eval_column_dict = []
 # Init
 ts_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model Name", "markdown", True, never_hidden=True)])
-ts_eval_column_dict.append(["model_provider", ColumnContent, ColumnContent("LLM Provider", "markdown", True)])
+# ts_eval_column_dict.append(["model_provider", ColumnContent, ColumnContent("LLM Provider", "markdown", True)])
 ts_eval_column_dict.append(["ts", ColumnContent, ColumnContent("Trust & Safety", "markdown", True)])
 ts_eval_column_dict.append(["safety", ColumnContent, ColumnContent("Safety", "markdown", False)])
 ts_eval_column_dict.append(["privacy", ColumnContent, ColumnContent("Privacy", "markdown", False)])
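With the Trust & Safety columns folded into auto_eval_column_dict, every leaderboard column still flows through the same make_dataclass call. Below is a minimal runnable sketch of that pattern, assuming a ColumnContent dataclass whose fields are inferred from the call sites above; the real definition elsewhere in src/display/utils.py may differ.

# A minimal sketch, assuming a ColumnContent shaped like the call sites suggest
# (field names here are inferred, not the repo's actual definition).
from dataclasses import dataclass, make_dataclass

@dataclass(frozen=True)
class ColumnContent:
    name: str                          # header text shown in the leaderboard
    type: str                          # Gradio column type, e.g. "markdown"
    displayed_by_default: bool = True  # visible without the user toggling it on
    never_hidden: bool = False         # cannot be hidden at all

auto_eval_column_dict = []
auto_eval_column_dict.append(
    ["model", ColumnContent, ColumnContent("Model Name", "markdown", True, never_hidden=True)]
)
auto_eval_column_dict.append(["ts", ColumnContent, ColumnContent("Trust & Safety", "markdown", True)])

# Each [attribute_name, attribute_type, default_value] triple becomes one field
# of the generated class, with the ColumnContent instance as its default.
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)

print(AutoEvalColumn.ts.name)  # -> Trust & Safety

Because each field's default is a ColumnContent instance, display names can be read straight off the class, which is how populate.py refers to columns (e.g. AutoEvalColumn.accuracy_metric_average.name).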
src/populate.py
CHANGED
@@ -31,10 +31,9 @@ def get_leaderboard_df_crm(
     )
 
     leaderboard_ts_df = pd.read_csv(os.path.join(crm_results_path, "hf_leaderboard_ts.csv"))
-
+    leaderboard_ts_crm_bias_df = pd.read_csv(os.path.join(crm_results_path, "hf_leaderboard_crm_bias.csv"))
     leaderboard_ts_df = leaderboard_ts_df[~leaderboard_ts_df["Model Name"].isin(sf_finetuned_models)]
-    leaderboard_ts_df = leaderboard_ts_df.join(…
-    leaderboard_ts_df = leaderboard_ts_df.join(leaderboard_ts__crm_bias_df.set_index("Model Name"), on="Model Name")
+    leaderboard_ts_df = leaderboard_ts_df.join(leaderboard_ts_crm_bias_df.set_index("Model Name"), on="Model Name")
     privacy_cols = leaderboard_ts_df[
         [
             "Privacy Zero-Shot Match Avoidance",
@@ -47,7 +46,7 @@ def get_leaderboard_df_crm(
     leaderboard_ts_df["Privacy"] = privacy_cols.mean(axis=1).transform(lambda x: "{:,.2%}".format(x))
     leaderboard_ts_df["Bias No CI"] = leaderboard_ts_df["CRM Bias"].transform(lambda x: x.split(" ")[0])
 
-    …
+    ts_lvl2_cols = leaderboard_ts_df[
         [
             "Safety",
             "Privacy",
@@ -55,7 +54,14 @@ def get_leaderboard_df_crm(
             "Bias No CI",
         ]
     ].apply(lambda x: x.str.rstrip("%").astype("float") / 100.0, axis=1)
-    leaderboard_ts_df["Trust & Safety"] = …
+    leaderboard_ts_df["Trust & Safety"] = ts_lvl2_cols.mean(axis=1).transform(lambda x: "{:,.2%}".format(x))
+
+    leaderboard_accuracy_df = leaderboard_accuracy_df.join(
+        leaderboard_ts_df[ts_cols].set_index(["Model Name"]),
+        on=["Model Name"],
+    )
+
+    leaderboard_ts_df = leaderboard_ts_df.join(ref_df.set_index("Model Name"), on="Model Name")
 
     leaderboard_accuracy_df = leaderboard_accuracy_df.sort_values(
         by=[AutoEvalColumn.accuracy_metric_average.name], ascending=False
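The populate.py side reads the new hf_leaderboard_crm_bias.csv, joins it into the Trust & Safety frame, averages the level-2 scores into a single Trust & Safety figure, and joins that onto the accuracy leaderboard. Below is a toy sketch of that roll-up and join with made-up scores; ts_cols here is an assumed stand-in, since its real definition lives elsewhere in populate.py.

# A toy sketch of the roll-up, with made-up numbers. The CSVs store percentages
# as strings ("90.00%"), so each level-2 column is stripped to a float, the row
# mean is taken, and the result is formatted back into a percent string.
import pandas as pd

leaderboard_ts_df = pd.DataFrame(
    {
        "Model Name": ["model-a", "model-b"],
        "Safety": ["90.00%", "80.00%"],
        "Privacy": ["92.00%", "86.00%"],
        "Bias No CI": ["88.00%", "74.00%"],
    }
)

ts_lvl2_cols = leaderboard_ts_df[["Safety", "Privacy", "Bias No CI"]].apply(
    lambda x: x.str.rstrip("%").astype("float") / 100.0, axis=1
)
leaderboard_ts_df["Trust & Safety"] = ts_lvl2_cols.mean(axis=1).transform(
    lambda x: "{:,.2%}".format(x)
)

# Join the roll-up onto the accuracy table keyed on "Model Name", as the new
# code does; ts_cols is an assumed stand-in for the list populate.py uses.
ts_cols = ["Model Name", "Trust & Safety"]
leaderboard_accuracy_df = pd.DataFrame(
    {"Model Name": ["model-a", "model-b"], "Accuracy": ["95.00%", "91.00%"]}
)
leaderboard_accuracy_df = leaderboard_accuracy_df.join(
    leaderboard_ts_df[ts_cols].set_index(["Model Name"]), on=["Model Name"]
)
print(leaderboard_accuracy_df)
#   Model Name Accuracy Trust & Safety
# 0    model-a   95.00%         90.00%
# 1    model-b   91.00%         80.00%

Formatting the mean back into a percent string keeps the joined column consistent with the other leaderboard columns, which are all rendered as markdown text rather than raw floats.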