[email protected] committed on
Commit d9df999 · 1 Parent(s): 8e18ea7
Files changed (1)
  1. src/leaderboard/read_evals.py +7 -5
src/leaderboard/read_evals.py CHANGED
@@ -74,14 +74,15 @@ class EvalResult:
             # We average all scores of a given metric (not all metrics are present in all files)
             accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
             if accs.size == 0 or any([acc is None for acc in accs]):
-                mean_acc = None
+                mean_acc = 0.0
+                # todo: None
             else:
                 mean_acc = np.nanmean(accs) * 100.0
             # if accs.size == 0:
             #     continue
             # mean_acc = np.mean(accs) * 100.0
             results[task.benchmark] = mean_acc
-        print(results)
+        print(full_model, results)
         return self(
             eval_name=result_key,
             full_model=full_model,
@@ -112,7 +113,8 @@ class EvalResult:
 
     def to_dict(self, task_class):
         """Converts the Eval Result to a dict compatible with our dataframe display"""
-        scores = [v for v in self.results.values() if v is not None]
+        # ignore uncertainty for overall calculation
+        scores = [v for k, v in self.results.items() if v is not None and k != 'uncertainty']
         average = sum(scores) / len(scores)
         # average = sum([v for v in self.results.values() if v is not None]) / len(task_class)
         data_dict = {
@@ -184,8 +186,8 @@ def get_raw_eval_results(results_path: str, requests_path: str, task_class) -> l
         # Store results of same eval together
         eval_name = eval_result.eval_name
         if eval_name in eval_results.keys():
-            # eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
-            eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items()})
+            eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
+            # eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items()})
         else:
             eval_results[eval_name] = eval_result
 
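
For context, a minimal sketch of the averaging behaviour this commit introduces, using made-up example scores (the real dict is built per benchmark by EvalResult from the results files): a benchmark whose metric is missing now contributes 0.0 instead of None, and the 'uncertainty' entry is skipped when to_dict computes the overall average.

# Hypothetical example values; only the filtering/averaging mirrors the commit.
results = {"task_a": 71.2, "task_b": 0.0, "uncertainty": 3.4}

# Same filter as the new to_dict: drop None scores and skip the 'uncertainty' key.
scores = [v for k, v in results.items() if v is not None and k != "uncertainty"]
average = sum(scores) / len(scores)  # (71.2 + 0.0) / 2 == 35.6
print(average)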