new metrics
Browse files- src/display/utils.py +10 -8
- src/leaderboard/processor.py +64 -44
src/display/utils.py
CHANGED
@@ -252,7 +252,7 @@ class GuardBenchColumn:
|
|
252 |
name="jailbreaked_answers_f1",
|
253 |
display_name="Jailbreaked Answers F1",
|
254 |
type="number",
|
255 |
-
displayed_by_default=
|
256 |
))
|
257 |
jailbreaked_answers_recall_binary: ColumnInfo = field(default_factory=lambda: ColumnInfo(
|
258 |
name="jailbreaked_answers_recall_binary",
|
@@ -278,6 +278,12 @@ class GuardBenchColumn:
|
|
278 |
type="number",
|
279 |
displayed_by_default=False
|
280 |
))
|
|
|
|
|
|
|
|
|
|
|
|
|
281 |
|
282 |
# Calculated overall metrics (renamed)
|
283 |
macro_accuracy: ColumnInfo = field(default_factory=lambda: ColumnInfo(
|
@@ -298,12 +304,7 @@ class GuardBenchColumn:
|
|
298 |
type="number",
|
299 |
displayed_by_default=False
|
300 |
))
|
301 |
-
|
302 |
-
name="integral_score",
|
303 |
-
display_name="Integral Score",
|
304 |
-
type="number",
|
305 |
-
displayed_by_default=True
|
306 |
-
))
|
307 |
# NEW Summary Metrics
|
308 |
micro_avg_error_ratio: ColumnInfo = field(default_factory=lambda: ColumnInfo(
|
309 |
name="micro_avg_error_ratio",
|
@@ -367,7 +368,8 @@ METRICS = [
|
|
367 |
"recall_binary",
|
368 |
"precision_binary",
|
369 |
"error_ratio",
|
370 |
-
"avg_runtime_ms"
|
|
|
371 |
]
|
372 |
|
373 |
def get_all_column_choices():
|
|
|
252 |
name="jailbreaked_answers_f1",
|
253 |
display_name="Jailbreaked Answers F1",
|
254 |
type="number",
|
255 |
+
displayed_by_default=False
|
256 |
))
|
257 |
jailbreaked_answers_recall_binary: ColumnInfo = field(default_factory=lambda: ColumnInfo(
|
258 |
name="jailbreaked_answers_recall_binary",
|
|
|
278 |
type="number",
|
279 |
displayed_by_default=False
|
280 |
))
|
281 |
+
integral_score: ColumnInfo = field(default_factory=lambda: ColumnInfo(
|
282 |
+
name="integral_score",
|
283 |
+
display_name="Integral Score",
|
284 |
+
type="number",
|
285 |
+
displayed_by_default=True
|
286 |
+
))
|
287 |
|
288 |
# Calculated overall metrics (renamed)
|
289 |
macro_accuracy: ColumnInfo = field(default_factory=lambda: ColumnInfo(
|
|
|
304 |
type="number",
|
305 |
displayed_by_default=False
|
306 |
))
|
307 |
+
|
|
|
|
|
|
|
|
|
|
|
308 |
# NEW Summary Metrics
|
309 |
micro_avg_error_ratio: ColumnInfo = field(default_factory=lambda: ColumnInfo(
|
310 |
name="micro_avg_error_ratio",
|
|
|
368 |
"recall_binary",
|
369 |
"precision_binary",
|
370 |
"error_ratio",
|
371 |
+
"avg_runtime_ms",
|
372 |
+
"accuracy"
|
373 |
]
|
374 |
|
375 |
def get_all_column_choices():
|
src/leaderboard/processor.py
CHANGED
@@ -19,52 +19,59 @@ MAX_RUNTIME_PENALTY = 0.75 # Corresponds to 1.0 - MIN_TIME_FACTOR, library used
|
|
19 |
def calculate_integral_score(row: pd.Series) -> float:
|
20 |
"""
|
21 |
Calculate the integral score for a given model entry row.
|
22 |
-
Uses
|
|
|
23 |
"""
|
24 |
integral_score = 1.0
|
25 |
metric_count = 0
|
26 |
|
27 |
-
# Primary metric (using
|
28 |
for test_type in TEST_TYPES:
|
29 |
-
metric_col = f"{test_type}
|
30 |
if metric_col in row and pd.notna(row[metric_col]):
|
31 |
integral_score *= row[metric_col]
|
32 |
metric_count += 1
|
33 |
|
34 |
-
#
|
35 |
if metric_count == 0:
|
36 |
-
|
37 |
-
|
38 |
-
integral_score *= row["average_f1"]
|
39 |
metric_count += 1
|
40 |
else:
|
41 |
return 0.0 # Cannot calculate score without primary metrics
|
42 |
|
43 |
-
#
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
68 |
runtime = max(
|
69 |
min(avg_runtime_ms, MAX_PUNISHABLE_RUNTIME_MS),
|
70 |
MIN_PUNISHABLE_RUNTIME_MS,
|
@@ -76,15 +83,12 @@ def calculate_integral_score(row: pd.Series) -> float:
|
|
76 |
)
|
77 |
time_factor = 1.0 - MAX_RUNTIME_PENALTY * normalized_time
|
78 |
else:
|
79 |
-
time_factor = 1.0 if runtime <= MIN_PUNISHABLE_RUNTIME_MS else (1.0 - MAX_RUNTIME_PENALTY)
|
80 |
|
81 |
-
# Make sure the factor is not less than the minimum value (1 - MAX_PENALTY)
|
82 |
time_factor = max((1.0 - MAX_RUNTIME_PENALTY), time_factor)
|
83 |
integral_score *= time_factor
|
84 |
|
85 |
-
#
|
86 |
-
# return integral_score ** (1 / metric_count) if metric_count > 0 else 0.0
|
87 |
-
# Let's skip the rooting for now to keep the scale potentially larger.
|
88 |
return integral_score
|
89 |
|
90 |
|
@@ -210,14 +214,25 @@ def leaderboard_to_dataframe(leaderboard_data: Dict) -> pd.DataFrame:
|
|
210 |
row[f"{test_type}_f1"] = metrics[metric]
|
211 |
|
212 |
# Calculate averages if not present
|
|
|
213 |
if "macro_accuracy" not in row:
|
214 |
-
|
215 |
for test_type in TEST_TYPES:
|
216 |
-
|
217 |
-
|
218 |
-
|
219 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
220 |
|
|
|
|
|
|
|
|
|
221 |
if "macro_recall" not in row:
|
222 |
recall_values = []
|
223 |
for test_type in TEST_TYPES:
|
@@ -255,9 +270,14 @@ def leaderboard_to_dataframe(leaderboard_data: Dict) -> pd.DataFrame:
|
|
255 |
col_name = f"{test_type}_{metric}"
|
256 |
if col_name not in df.columns:
|
257 |
df[col_name] = pd.NA # Use pd.NA for missing numeric data
|
258 |
-
|
|
|
259 |
if metric == "f1_binary" and f"{test_type}_f1" not in df.columns:
|
260 |
-
|
|
|
|
|
|
|
|
|
261 |
|
262 |
# Calculate Integral Score
|
263 |
if not df.empty:
|
|
|
19 |
def calculate_integral_score(row: pd.Series) -> float:
|
20 |
"""
|
21 |
Calculate the integral score for a given model entry row.
|
22 |
+
Uses accuracy as the primary metric, micro error ratio, and micro runtime penalty.
|
23 |
+
Falls back to macro accuracy and averaged per-test-type errors/runtimes if micro values are missing.
|
24 |
"""
|
25 |
integral_score = 1.0
|
26 |
metric_count = 0
|
27 |
|
28 |
+
# Primary metric (using accuracy)
|
29 |
for test_type in TEST_TYPES:
|
30 |
+
metric_col = f"{test_type}_accuracy"
|
31 |
if metric_col in row and pd.notna(row[metric_col]):
|
32 |
integral_score *= row[metric_col]
|
33 |
metric_count += 1
|
34 |
|
35 |
+
# Fallback if no primary metrics found
|
36 |
if metric_count == 0:
|
37 |
+
if "macro_accuracy" in row and pd.notna(row["macro_accuracy"]):
|
38 |
+
integral_score *= row["macro_accuracy"]
|
|
|
39 |
metric_count += 1
|
40 |
else:
|
41 |
return 0.0 # Cannot calculate score without primary metrics
|
42 |
|
43 |
+
# Error Penalty
|
44 |
+
micro_error_col = "micro_avg_error_ratio"
|
45 |
+
if micro_error_col in row and pd.notna(row[micro_error_col]):
|
46 |
+
# Micro error is stored as %, convert back to ratio
|
47 |
+
micro_error_ratio = row[micro_error_col] / 100.0
|
48 |
+
integral_score *= (1.0 - micro_error_ratio)
|
49 |
+
else:
|
50 |
+
# Fallback: Calculate average error from per-test-type
|
51 |
+
error_ratios = []
|
52 |
+
for test_type in TEST_TYPES:
|
53 |
+
error_col = f"{test_type}_error_ratio"
|
54 |
+
if error_col in row and pd.notna(row[error_col]):
|
55 |
+
error_ratios.append(row[error_col])
|
56 |
+
if error_ratios:
|
57 |
+
avg_error_ratio = np.mean(error_ratios)
|
58 |
+
integral_score *= (1.0 - avg_error_ratio)
|
59 |
+
|
60 |
+
# Runtime Penalty
|
61 |
+
micro_runtime_col = "micro_avg_runtime_ms"
|
62 |
+
if micro_runtime_col in row and pd.notna(row[micro_runtime_col]):
|
63 |
+
avg_runtime_ms = row[micro_runtime_col]
|
64 |
+
else:
|
65 |
+
# Fallback: Calculate average runtime from per-test-type
|
66 |
+
runtimes = []
|
67 |
+
for test_type in TEST_TYPES:
|
68 |
+
runtime_col = f"{test_type}_avg_runtime_ms"
|
69 |
+
if runtime_col in row and pd.notna(row[runtime_col]):
|
70 |
+
runtimes.append(row[runtime_col])
|
71 |
+
avg_runtime_ms = np.mean(runtimes) if runtimes else None
|
72 |
+
|
73 |
+
if avg_runtime_ms is not None:
|
74 |
+
# Apply penalty based on runtime (using micro or calculated average)
|
75 |
runtime = max(
|
76 |
min(avg_runtime_ms, MAX_PUNISHABLE_RUNTIME_MS),
|
77 |
MIN_PUNISHABLE_RUNTIME_MS,
|
|
|
83 |
)
|
84 |
time_factor = 1.0 - MAX_RUNTIME_PENALTY * normalized_time
|
85 |
else:
|
86 |
+
time_factor = 1.0 if runtime <= MIN_PUNISHABLE_RUNTIME_MS else (1.0 - MAX_RUNTIME_PENALTY)
|
87 |
|
|
|
88 |
time_factor = max((1.0 - MAX_RUNTIME_PENALTY), time_factor)
|
89 |
integral_score *= time_factor
|
90 |
|
91 |
+
# Rooting is not done in the reference library's summary table calculation
|
|
|
|
|
92 |
return integral_score
|
93 |
|
94 |
|
|
|
214 |
row[f"{test_type}_f1"] = metrics[metric]
|
215 |
|
216 |
# Calculate averages if not present
|
217 |
+
# Use accuracy for macro_accuracy
|
218 |
if "macro_accuracy" not in row:
|
219 |
+
accuracy_values = []
|
220 |
for test_type in TEST_TYPES:
|
221 |
+
# Check avg_metrics structure first
|
222 |
+
accuracy_val = None
|
223 |
+
if test_type in avg_metrics and "accuracy" in avg_metrics[test_type] and pd.notna(avg_metrics[test_type]["accuracy"]):
|
224 |
+
accuracy_val = avg_metrics[test_type]["accuracy"]
|
225 |
+
# Check flat structure as fallback (might be redundant but safer)
|
226 |
+
elif f"{test_type}_accuracy" in row and pd.notna(row[f"{test_type}_accuracy"]):
|
227 |
+
accuracy_val = row[f"{test_type}_accuracy"]
|
228 |
+
|
229 |
+
if accuracy_val is not None:
|
230 |
+
accuracy_values.append(accuracy_val)
|
231 |
|
232 |
+
if accuracy_values:
|
233 |
+
row["macro_accuracy"] = sum(accuracy_values) / len(accuracy_values)
|
234 |
+
|
235 |
+
# Use recall_binary for macro_recall
|
236 |
if "macro_recall" not in row:
|
237 |
recall_values = []
|
238 |
for test_type in TEST_TYPES:
|
|
|
270 |
col_name = f"{test_type}_{metric}"
|
271 |
if col_name not in df.columns:
|
272 |
df[col_name] = pd.NA # Use pd.NA for missing numeric data
|
273 |
+
|
274 |
+
# Add non-binary F1 if binary exists and f1 is missing
|
275 |
if metric == "f1_binary" and f"{test_type}_f1" not in df.columns:
|
276 |
+
# Check if the binary column has data before copying
|
277 |
+
if col_name in df.columns:
|
278 |
+
df[f"{test_type}_f1"] = df[col_name]
|
279 |
+
else:
|
280 |
+
df[f"{test_type}_f1"] = pd.NA
|
281 |
|
282 |
# Calculate Integral Score
|
283 |
if not df.empty:
|