yibum committed
Commit c64c31e · 1 Parent(s): c4f7097

join Trust & Safety table

Files changed (2):
  1. src/display/utils.py +14 -10
  2. src/populate.py +11 -5
src/display/utils.py CHANGED
@@ -26,35 +26,39 @@ auto_eval_column_dict.append(
     ["model", ColumnContent, ColumnContent("Model Name", "markdown", True, never_hidden=True)]
 )
 auto_eval_column_dict.append(["model_provider", ColumnContent, ColumnContent("LLM Provider", "markdown", True)])
-auto_eval_column_dict.append(["use_case_name", ColumnContent, ColumnContent("Use Case Name", "markdown", True)])
+auto_eval_column_dict.append(["use_case_name", ColumnContent, ColumnContent("Use Case Name", "markdown", False)])
 auto_eval_column_dict.append(["use_case_type", ColumnContent, ColumnContent("Use Case Type", "markdown", False)])
 auto_eval_column_dict.append(["accuracy_method", ColumnContent, ColumnContent("Accuracy Method", "markdown", False)])
 # Accuracy metrics
-auto_eval_column_dict.append(["accuracy_metric_average", ColumnContent, ColumnContent("Accuracy", "markdown", True)])
+auto_eval_column_dict.append(["accuracy_metric_average", ColumnContent, ColumnContent("Accuracy", "markdown", False)])
 auto_eval_column_dict.append(
     [
         "accuracy_metric_instruction_following",
         ColumnContent,
-        ColumnContent("Instruction Following", "markdown", True),
+        ColumnContent("Instruction Following", "markdown", False),
     ]
 )
 auto_eval_column_dict.append(
-    ["accuracy_metric_completeness", ColumnContent, ColumnContent("Completeness", "markdown", True)]
+    ["accuracy_metric_completeness", ColumnContent, ColumnContent("Completeness", "markdown", False)]
 )
 auto_eval_column_dict.append(
-    ["accuracy_metric_conciseness", ColumnContent, ColumnContent("Conciseness", "markdown", True)]
+    ["accuracy_metric_conciseness", ColumnContent, ColumnContent("Conciseness", "markdown", False)]
 )
 auto_eval_column_dict.append(
-    ["accuracy_metric_factuality", ColumnContent, ColumnContent("Factuality", "markdown", True)]
+    ["accuracy_metric_factuality", ColumnContent, ColumnContent("Factuality", "markdown", False)]
 )
-# auto_eval_column_dict.append(
-#     ["use_case_flavor", ColumnContent, ColumnContent("Cost and Speed: Flavor", "markdown", False)]
-# )
+# Speed (Latency) & Cost metrics
 auto_eval_column_dict.append(["latency", ColumnContent, ColumnContent("Response Time (Sec)", "markdown", True)])
 auto_eval_column_dict.append(
     ["mean_output_tokens", ColumnContent, ColumnContent("Mean Output Tokens", "markdown", True)]
 )
 auto_eval_column_dict.append(["cost_band", ColumnContent, ColumnContent("Cost Band", "markdown", True)])
+# Trust & Safety metrics
+auto_eval_column_dict.append(["ts", ColumnContent, ColumnContent("Trust & Safety", "markdown", True)])
+auto_eval_column_dict.append(["safety", ColumnContent, ColumnContent("Safety", "markdown", False)])
+auto_eval_column_dict.append(["privacy", ColumnContent, ColumnContent("Privacy", "markdown", False)])
+auto_eval_column_dict.append(["truthfulness", ColumnContent, ColumnContent("Truthfulness", "markdown", False)])
+auto_eval_column_dict.append(["crm_bias", ColumnContent, ColumnContent("CRM Bias", "markdown", False)])
 # We use make dataclass to dynamically fill the scores from Tasks
 AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)

@@ -80,7 +84,7 @@ CostEvalColumn = make_dataclass("CostEvalColumn", cost_eval_column_dict, frozen=
 ts_eval_column_dict = []
 # Init
 ts_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model Name", "markdown", True, never_hidden=True)])
-ts_eval_column_dict.append(["model_provider", ColumnContent, ColumnContent("LLM Provider", "markdown", True)])
+# ts_eval_column_dict.append(["model_provider", ColumnContent, ColumnContent("LLM Provider", "markdown", True)])
 ts_eval_column_dict.append(["ts", ColumnContent, ColumnContent("Trust & Safety", "markdown", True)])
 ts_eval_column_dict.append(["safety", ColumnContent, ColumnContent("Safety", "markdown", False)])
 ts_eval_column_dict.append(["privacy", ColumnContent, ColumnContent("Privacy", "markdown", False)])
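For context on the pattern above: make_dataclass turns each [attribute_name, type, default] triple appended to auto_eval_column_dict into one field of the frozen AutoEvalColumn class, which is why populate.py can refer to AutoEvalColumn.accuracy_metric_average.name. A minimal runnable sketch follows; the ColumnContent definition here is an assumption inferred from the call sites in the diff (the real class is defined elsewhere in src/display/utils.py).

from dataclasses import dataclass, make_dataclass

@dataclass(frozen=True)
class ColumnContent:  # assumed shape, inferred from the calls in the diff
    name: str  # label shown in the leaderboard header, e.g. "Trust & Safety"
    type: str  # cell renderer, e.g. "markdown"
    displayed_by_default: bool
    never_hidden: bool = False

auto_eval_column_dict = [
    ["ts", ColumnContent, ColumnContent("Trust & Safety", "markdown", True)],
    ["safety", ColumnContent, ColumnContent("Safety", "markdown", False)],
]

# Each [attr_name, attr_type, default_instance] triple becomes one class field.
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)

print(AutoEvalColumn.ts.name)                      # Trust & Safety
print(AutoEvalColumn.safety.displayed_by_default)  # False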
src/populate.py CHANGED
@@ -31,10 +31,9 @@ def get_leaderboard_df_crm(
     )

     leaderboard_ts_df = pd.read_csv(os.path.join(crm_results_path, "hf_leaderboard_ts.csv"))
-    leaderboard_ts__crm_bias_df = pd.read_csv(os.path.join(crm_results_path, "hf_leaderboard_crm_bias.csv"))
+    leaderboard_ts_crm_bias_df = pd.read_csv(os.path.join(crm_results_path, "hf_leaderboard_crm_bias.csv"))
     leaderboard_ts_df = leaderboard_ts_df[~leaderboard_ts_df["Model Name"].isin(sf_finetuned_models)]
-    leaderboard_ts_df = leaderboard_ts_df.join(ref_df.set_index("Model Name"), on="Model Name")
-    leaderboard_ts_df = leaderboard_ts_df.join(leaderboard_ts__crm_bias_df.set_index("Model Name"), on="Model Name")
+    leaderboard_ts_df = leaderboard_ts_df.join(leaderboard_ts_crm_bias_df.set_index("Model Name"), on="Model Name")
     privacy_cols = leaderboard_ts_df[
         [
             "Privacy Zero-Shot Match Avoidance",
@@ -47,7 +46,7 @@ def get_leaderboard_df_crm(
     leaderboard_ts_df["Privacy"] = privacy_cols.mean(axis=1).transform(lambda x: "{:,.2%}".format(x))
     leaderboard_ts_df["Bias No CI"] = leaderboard_ts_df["CRM Bias"].transform(lambda x: x.split(" ")[0])

-    ts_cols = leaderboard_ts_df[
+    ts_lvl2_cols = leaderboard_ts_df[
         [
             "Safety",
             "Privacy",
@@ -55,7 +54,14 @@ def get_leaderboard_df_crm(
             "Bias No CI",
         ]
     ].apply(lambda x: x.str.rstrip("%").astype("float") / 100.0, axis=1)
-    leaderboard_ts_df["Trust & Safety"] = ts_cols.mean(axis=1).transform(lambda x: "{:,.2%}".format(x))
+    leaderboard_ts_df["Trust & Safety"] = ts_lvl2_cols.mean(axis=1).transform(lambda x: "{:,.2%}".format(x))
+
+    leaderboard_accuracy_df = leaderboard_accuracy_df.join(
+        leaderboard_ts_df[ts_cols].set_index(["Model Name"]),
+        on=["Model Name"],
+    )
+
+    leaderboard_ts_df = leaderboard_ts_df.join(ref_df.set_index("Model Name"), on="Model Name")

     leaderboard_accuracy_df = leaderboard_accuracy_df.sort_values(
         by=[AutoEvalColumn.accuracy_metric_average.name], ascending=False
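The last two hunks compute the Trust & Safety roll-up on percent-formatted strings: each level-2 column (Safety, Privacy, Truthfulness, Bias No CI) is parsed back to a float, averaged row-wise, and re-rendered as a percent string. A self-contained sketch of that round trip, using made-up scores:

import pandas as pd

# Made-up values; the real ones come from hf_leaderboard_ts.csv plus the
# hf_leaderboard_crm_bias.csv join introduced by this commit.
leaderboard_ts_df = pd.DataFrame(
    {
        "Safety": ["90.00%", "80.00%"],
        "Privacy": ["70.00%", "60.00%"],
        "Truthfulness": ["85.00%", "75.00%"],
        "Bias No CI": ["95.00%", "65.00%"],
    }
)

# "90.00%" -> 0.90 per cell, applied row by row ...
ts_lvl2_cols = leaderboard_ts_df[
    ["Safety", "Privacy", "Truthfulness", "Bias No CI"]
].apply(lambda x: x.str.rstrip("%").astype("float") / 100.0, axis=1)

# ... then the row means go back to display format.
leaderboard_ts_df["Trust & Safety"] = ts_lvl2_cols.mean(axis=1).transform(
    lambda x: "{:,.2%}".format(x)
)
print(leaderboard_ts_df["Trust & Safety"].tolist())  # ['85.00%', '70.00%']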
 
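The headline change is the new join step: the Trust & Safety columns are merged into the accuracy leaderboard keyed on "Model Name", and the ref_df join is deferred until after that merge. Note that ts_cols is not defined in the hunks shown; it is presumably a list of Trust & Safety column labels declared elsewhere in src/populate.py. A minimal sketch of the join pattern, with hypothetical frames and an assumed ts_cols:

import pandas as pd

# Hypothetical stand-ins for the two leaderboard frames.
leaderboard_accuracy_df = pd.DataFrame(
    {"Model Name": ["model-a", "model-b"], "Accuracy": ["91.00%", "84.00%"]}
)
leaderboard_ts_df = pd.DataFrame(
    {"Model Name": ["model-b", "model-a"], "Trust & Safety": ["70.00%", "85.00%"]}
)

# Assumed contents of ts_cols: the join key plus every T&S column to carry over.
ts_cols = ["Model Name", "Trust & Safety"]

# Left join: accuracy rows keep their order and pick up matching T&S values;
# models absent from leaderboard_ts_df would get NaN.
leaderboard_accuracy_df = leaderboard_accuracy_df.join(
    leaderboard_ts_df[ts_cols].set_index(["Model Name"]),
    on=["Model Name"],
)
print(leaderboard_accuracy_df)

DataFrame.join matches the caller's "Model Name" column against the right-hand frame's index, which is why the right-hand frame is passed through set_index first.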