apsys committed
Commit a3c3e83 · Parent(s): 5565a34

new metrics

Files changed (2):
  1. src/display/utils.py  +10 -8
  2. src/leaderboard/processor.py  +64 -44
src/display/utils.py CHANGED
@@ -252,7 +252,7 @@ class GuardBenchColumn:
         name="jailbreaked_answers_f1",
         display_name="Jailbreaked Answers F1",
         type="number",
-        displayed_by_default=True
+        displayed_by_default=False
     ))
     jailbreaked_answers_recall_binary: ColumnInfo = field(default_factory=lambda: ColumnInfo(
         name="jailbreaked_answers_recall_binary",
@@ -278,6 +278,12 @@ class GuardBenchColumn:
         type="number",
         displayed_by_default=False
     ))
+    integral_score: ColumnInfo = field(default_factory=lambda: ColumnInfo(
+        name="integral_score",
+        display_name="Integral Score",
+        type="number",
+        displayed_by_default=True
+    ))

     # Calculated overall metrics (renamed)
     macro_accuracy: ColumnInfo = field(default_factory=lambda: ColumnInfo(
@@ -298,12 +304,7 @@ class GuardBenchColumn:
         type="number",
         displayed_by_default=False
     ))
-    integral_score: ColumnInfo = field(default_factory=lambda: ColumnInfo(
-        name="integral_score",
-        display_name="Integral Score",
-        type="number",
-        displayed_by_default=True
-    ))
+
     # NEW Summary Metrics
     micro_avg_error_ratio: ColumnInfo = field(default_factory=lambda: ColumnInfo(
         name="micro_avg_error_ratio",
@@ -367,7 +368,8 @@ METRICS = [
     "recall_binary",
     "precision_binary",
     "error_ratio",
-    "avg_runtime_ms"
+    "avg_runtime_ms",
+    "accuracy"
 ]

 def get_all_column_choices():
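For context on the column changes above: displayed_by_default is the flag that decides whether a column appears in the default leaderboard view, so this commit swaps Integral Score into that view and hides Jailbreaked Answers F1. The snippet below is a minimal, self-contained sketch of that pattern only; ColumnInfo is re-declared here with just the fields shown in the diff, and default_columns is a hypothetical helper, not the repo's get_all_column_choices.

from dataclasses import dataclass, field, fields
from typing import List

@dataclass
class ColumnInfo:
    # Only the fields that appear in the diff above; illustrative re-declaration.
    name: str
    display_name: str
    type: str = "number"
    displayed_by_default: bool = False

@dataclass
class GuardBenchColumnSketch:
    # Two of the columns this commit touches: Integral Score becomes a default
    # column, Jailbreaked Answers F1 is hidden by default.
    integral_score: ColumnInfo = field(default_factory=lambda: ColumnInfo(
        name="integral_score", display_name="Integral Score", displayed_by_default=True))
    jailbreaked_answers_f1: ColumnInfo = field(default_factory=lambda: ColumnInfo(
        name="jailbreaked_answers_f1", display_name="Jailbreaked Answers F1"))

def default_columns(columns: GuardBenchColumnSketch) -> List[str]:
    # Hypothetical helper: display names of the columns shown by default.
    return [getattr(columns, f.name).display_name
            for f in fields(columns)
            if getattr(columns, f.name).displayed_by_default]

print(default_columns(GuardBenchColumnSketch()))  # ['Integral Score']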
src/leaderboard/processor.py CHANGED
@@ -19,52 +19,59 @@ MAX_RUNTIME_PENALTY = 0.75 # Corresponds to 1.0 - MIN_TIME_FACTOR, library used
 def calculate_integral_score(row: pd.Series) -> float:
     """
     Calculate the integral score for a given model entry row.
-    Uses F1-binary as the primary metric, error ratio, and runtime penalty.
+    Uses accuracy as the primary metric, micro error ratio, and micro runtime penalty.
+    Falls back to macro accuracy and averaged per-test-type errors/runtimes if micro values are missing.
     """
     integral_score = 1.0
     metric_count = 0

-    # Primary metric (using f1_binary, could be changed to accuracy if needed)
+    # Primary metric (using accuracy)
     for test_type in TEST_TYPES:
-        metric_col = f"{test_type}_f1_binary"
+        metric_col = f"{test_type}_accuracy"
         if metric_col in row and pd.notna(row[metric_col]):
             integral_score *= row[metric_col]
             metric_count += 1

-    # If no primary metrics found, return 0
+    # Fallback if no primary metrics found
     if metric_count == 0:
-        # Check for average_f1 as a fallback
-        if "average_f1" in row and pd.notna(row["average_f1"]):
-            integral_score *= row["average_f1"]
+        if "macro_accuracy" in row and pd.notna(row["macro_accuracy"]):
+            integral_score *= row["macro_accuracy"]
             metric_count += 1
         else:
             return 0.0  # Cannot calculate score without primary metrics

-    # Account for average errors across all test types (using a simple average for now)
-    # This requires micro-level error data which isn't directly in avg_metrics.
-    # We'll approximate using the average of available error ratios.
-    error_ratios = []
-    for test_type in TEST_TYPES:
-        error_col = f"{test_type}_error_ratio"
-        if error_col in row and pd.notna(row[error_col]):
-            error_ratios.append(row[error_col])
-
-    if error_ratios:
-        avg_error_ratio = np.mean(error_ratios)
-        integral_score *= (1.0 - avg_error_ratio)
-
-    # Account for average runtime across all test types (using a simple average for now)
-    # This requires micro-level runtime data. We'll approximate.
-    runtimes = []
-    for test_type in TEST_TYPES:
-        runtime_col = f"{test_type}_avg_runtime_ms"
-        if runtime_col in row and pd.notna(row[runtime_col]):
-            runtimes.append(row[runtime_col])
-
-    if runtimes:
-        avg_runtime_ms = np.mean(runtimes)
-
-        # Apply penalty based on runtime
+    # Error Penalty
+    micro_error_col = "micro_avg_error_ratio"
+    if micro_error_col in row and pd.notna(row[micro_error_col]):
+        # Micro error is stored as %, convert back to ratio
+        micro_error_ratio = row[micro_error_col] / 100.0
+        integral_score *= (1.0 - micro_error_ratio)
+    else:
+        # Fallback: Calculate average error from per-test-type
+        error_ratios = []
+        for test_type in TEST_TYPES:
+            error_col = f"{test_type}_error_ratio"
+            if error_col in row and pd.notna(row[error_col]):
+                error_ratios.append(row[error_col])
+        if error_ratios:
+            avg_error_ratio = np.mean(error_ratios)
+            integral_score *= (1.0 - avg_error_ratio)
+
+    # Runtime Penalty
+    micro_runtime_col = "micro_avg_runtime_ms"
+    if micro_runtime_col in row and pd.notna(row[micro_runtime_col]):
+        avg_runtime_ms = row[micro_runtime_col]
+    else:
+        # Fallback: Calculate average runtime from per-test-type
+        runtimes = []
+        for test_type in TEST_TYPES:
+            runtime_col = f"{test_type}_avg_runtime_ms"
+            if runtime_col in row and pd.notna(row[runtime_col]):
+                runtimes.append(row[runtime_col])
+        avg_runtime_ms = np.mean(runtimes) if runtimes else None
+
+    if avg_runtime_ms is not None:
+        # Apply penalty based on runtime (using micro or calculated average)
         runtime = max(
             min(avg_runtime_ms, MAX_PUNISHABLE_RUNTIME_MS),
             MIN_PUNISHABLE_RUNTIME_MS,
@@ -76,15 +83,12 @@ def calculate_integral_score(row: pd.Series) -> float:
             )
             time_factor = 1.0 - MAX_RUNTIME_PENALTY * normalized_time
         else:
-            time_factor = 1.0 if runtime <= MIN_PUNISHABLE_RUNTIME_MS else (1.0 - MAX_RUNTIME_PENALTY)  # Assign max penalty if runtime exceeds min when max==min
+            time_factor = 1.0 if runtime <= MIN_PUNISHABLE_RUNTIME_MS else (1.0 - MAX_RUNTIME_PENALTY)

-        # Make sure the factor is not less than the minimum value (1 - MAX_PENALTY)
         time_factor = max((1.0 - MAX_RUNTIME_PENALTY), time_factor)
         integral_score *= time_factor

-    # Root the score by the number of primary metrics used? (Optional, library did this)
-    # return integral_score ** (1 / metric_count) if metric_count > 0 else 0.0
-    # Let's skip the rooting for now to keep the scale potentially larger.
+    # Rooting is not done in the reference library's summary table calculation
     return integral_score


@@ -210,14 +214,25 @@ def leaderboard_to_dataframe(leaderboard_data: Dict) -> pd.DataFrame:
                     row[f"{test_type}_f1"] = metrics[metric]

         # Calculate averages if not present
+        # Use accuracy for macro_accuracy
         if "macro_accuracy" not in row:
-            f1_values = []
+            accuracy_values = []
             for test_type in TEST_TYPES:
-                if test_type in avg_metrics and "f1_binary" in avg_metrics[test_type] and pd.notna(avg_metrics[test_type]["f1_binary"]):
-                    f1_values.append(avg_metrics[test_type]["f1_binary"])
-            if f1_values:
-                row["macro_accuracy"] = sum(f1_values) / len(f1_values)
+                # Check avg_metrics structure first
+                accuracy_val = None
+                if test_type in avg_metrics and "accuracy" in avg_metrics[test_type] and pd.notna(avg_metrics[test_type]["accuracy"]):
+                    accuracy_val = avg_metrics[test_type]["accuracy"]
+                # Check flat structure as fallback (might be redundant but safer)
+                elif f"{test_type}_accuracy" in row and pd.notna(row[f"{test_type}_accuracy"]):
+                    accuracy_val = row[f"{test_type}_accuracy"]
+
+                if accuracy_val is not None:
+                    accuracy_values.append(accuracy_val)

+            if accuracy_values:
+                row["macro_accuracy"] = sum(accuracy_values) / len(accuracy_values)
+
+        # Use recall_binary for macro_recall
         if "macro_recall" not in row:
             recall_values = []
             for test_type in TEST_TYPES:
@@ -255,9 +270,14 @@ def leaderboard_to_dataframe(leaderboard_data: Dict) -> pd.DataFrame:
             col_name = f"{test_type}_{metric}"
             if col_name not in df.columns:
                 df[col_name] = pd.NA  # Use pd.NA for missing numeric data
-            # Add non-binary F1 if binary exists
+
+            # Add non-binary F1 if binary exists and f1 is missing
             if metric == "f1_binary" and f"{test_type}_f1" not in df.columns:
-                df[f"{test_type}_f1"] = df[col_name]  # Copy f1_binary to f1 if f1 is missing
+                # Check if the binary column has data before copying
+                if col_name in df.columns:
+                    df[f"{test_type}_f1"] = df[col_name]
+                else:
+                    df[f"{test_type}_f1"] = pd.NA

     # Calculate Integral Score
     if not df.empty:
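To make the revised scoring concrete, the sketch below reproduces the shape of the new calculation: a product of per-test-type accuracies, scaled by the micro error ratio (stored as a percentage) and by a clamped, linearly interpolated runtime factor. Only MAX_RUNTIME_PENALTY = 0.75 is visible in the hunk header above; TEST_TYPES and the MIN/MAX punishable runtime constants here are assumed values, and integral_score_sketch is not the repo's calculate_integral_score.

import pandas as pd

# MAX_RUNTIME_PENALTY = 0.75 comes from the hunk header above; the other two
# constants and the TEST_TYPES list are assumptions for this sketch only.
MAX_RUNTIME_PENALTY = 0.75
MIN_PUNISHABLE_RUNTIME_MS = 100.0      # assumed
MAX_PUNISHABLE_RUNTIME_MS = 10_000.0   # assumed
TEST_TYPES = ["prompts", "answers"]    # assumed

def integral_score_sketch(row: pd.Series) -> float:
    # 1. Product of per-test-type accuracies (the new primary metric).
    score = 1.0
    used = 0
    for t in TEST_TYPES:
        col = f"{t}_accuracy"
        if col in row and pd.notna(row[col]):
            score *= row[col]
            used += 1
    if used == 0:
        return 0.0

    # 2. Error penalty: micro_avg_error_ratio is stored as a percentage.
    if "micro_avg_error_ratio" in row and pd.notna(row["micro_avg_error_ratio"]):
        score *= 1.0 - row["micro_avg_error_ratio"] / 100.0

    # 3. Runtime penalty: clamp into the punishable window, then scale linearly
    #    so the factor runs from 1.0 down to 1 - MAX_RUNTIME_PENALTY = 0.25.
    if "micro_avg_runtime_ms" in row and pd.notna(row["micro_avg_runtime_ms"]):
        runtime = max(min(row["micro_avg_runtime_ms"], MAX_PUNISHABLE_RUNTIME_MS),
                      MIN_PUNISHABLE_RUNTIME_MS)
        normalized = (runtime - MIN_PUNISHABLE_RUNTIME_MS) / (
            MAX_PUNISHABLE_RUNTIME_MS - MIN_PUNISHABLE_RUNTIME_MS)
        score *= max(1.0 - MAX_RUNTIME_PENALTY, 1.0 - MAX_RUNTIME_PENALTY * normalized)
    return score

# Example: 0.9 * 0.8 accuracy product, 2% errors, 5050 ms runtime
# -> 0.72 * 0.98 * (1 - 0.75 * 0.5) = 0.72 * 0.98 * 0.625 ≈ 0.441
row = pd.Series({"prompts_accuracy": 0.9, "answers_accuracy": 0.8,
                 "micro_avg_error_ratio": 2.0, "micro_avg_runtime_ms": 5050.0})
print(round(integral_score_sketch(row), 3))  # 0.441

With MAX_RUNTIME_PENALTY = 0.75 the runtime factor bottoms out at 0.25, so even the slowest model keeps a quarter of its accuracy-and-error product.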