idolezal committed on
Commit
ff10e2a
·
1 Parent(s): 31d445e

Correcting significance in `tournament_results` with FDR correction method

Browse files
Files changed (1) hide show
  1. server.py +41 -1
server.py CHANGED
@@ -167,6 +167,30 @@ def check_significance(model_a_path, model_b_path):
167
  result = check_significance_wait_for_result(result_url)
168
  return result
169
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
170
  class NoneLock:
171
  def __init__(self, *args, **kwargs):
172
  pass
@@ -543,6 +567,7 @@ class LeaderboardServer:
543
 
544
  with self.var_lock.ro:
545
  tournament_results = pre_submit.tournament_results if pre_submit else self.tournament_results
 
546
 
547
  for competitor_id in tournament_results[submission_id].keys() - {submission_id}: # without self
548
  if competitor_id not in self.submission_id_to_data:
@@ -560,7 +585,7 @@ class LeaderboardServer:
560
  if to_csv:
561
  match_results[task] = tournament_results[submission_id][competitor_id][task]["significant"]
562
  else:
563
- match_task_result_details = dict.fromkeys(["significant", "p_value"]) # order has impact to sorting DataFrame
564
  match_task_result_details.update(copy.deepcopy(tournament_results[submission_id][competitor_id][task]))
565
  match_task_result_details["significant"] = str(match_task_result_details["significant"]).lower() # originaly bool
566
  match_task_result_significant = match_task_result_details["significant"]
@@ -611,6 +636,20 @@ class LeaderboardServer:
611
  dataframe = dataframe.style.apply(self._model_tournament_table_highlight_true_and_false, axis=None)
612
  return dataframe
613
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
614
  def _dataframe_to_csv(self, dataframe, filename):
615
  try:
616
  if not os.path.isdir(self.DIR_DATAFRAMES_CSV):
@@ -689,6 +728,7 @@ class LeaderboardServer:
689
  def _get_leaderboard(self, pre_submit=None, category=None, to_csv=False):
690
  with self.var_lock.ro:
691
  tournament_results = pre_submit.tournament_results if pre_submit else self.tournament_results
 
692
  category = category if category else self.TASKS_CATEGORY_OVERALL
693
 
694
  if len(tournament_results) == 0:
 
167
  result = check_significance_wait_for_result(result_url)
168
  return result
169
 
170
def correct_pvals_for_fdr(model_task_pvals, fdr_alpha=0.05, repeat_on_conn_timeout=10):
    """Ask the remote compare-significance service to FDR-correct p-values.

    Args:
        model_task_pvals: p-values to correct; sent as the ``"pvals"`` field
            of the JSON request body.
        fdr_alpha: sent as the ``"fdr_alpha"`` field of the request body.
        repeat_on_conn_timeout: forwarded as the first argument of
            ``check_significance_repeat_on_conn_timeout`` (presumably the
            number of retries on a connection timeout -- confirm against
            that helper).

    Returns:
        The decoded JSON body of a 200 response. Presumably the corrected
        p-values in the same order as the input -- verify against the
        service.

    Raises:
        CheckSignificanceError: if the service answers with 429 (too busy)
            or with any other status code different from 200.
    """
    endpoint = 'https://czechllm.fit.vutbr.cz/benczechmark-leaderboard/compare_significance/correct_pvals_for_fdr'
    payload = {"pvals": model_task_pvals, "fdr_alpha": fdr_alpha}

    # The POST goes through the shared helper; each attempt may wait up to
    # five minutes for the service to answer.
    response = check_significance_repeat_on_conn_timeout(
        repeat_on_conn_timeout,
        requests.post, endpoint, json=payload, timeout=60 * 5,
    )

    # Fail fast on anything but success; 429 gets its own friendlier message.
    status = response.status_code
    if status == 429:
        raise CheckSignificanceError('Server is too busy. Please try again later.')
    if status != 200:
        raise CheckSignificanceError(f'Failed to submit task. Status code: {response.status_code}')

    return response.json()
193
+
194
  class NoneLock:
195
  def __init__(self, *args, **kwargs):
196
  pass
 
567
 
568
  with self.var_lock.ro:
569
  tournament_results = pre_submit.tournament_results if pre_submit else self.tournament_results
570
+ tournament_results = self._correct_significance_in_tournament_results(tournament_results)
571
 
572
  for competitor_id in tournament_results[submission_id].keys() - {submission_id}: # without self
573
  if competitor_id not in self.submission_id_to_data:
 
585
  if to_csv:
586
  match_results[task] = tournament_results[submission_id][competitor_id][task]["significant"]
587
  else:
588
+ match_task_result_details = dict.fromkeys(["significant", "corrected_p_value", "p_value"]) # order has impact to sorting DataFrame
589
  match_task_result_details.update(copy.deepcopy(tournament_results[submission_id][competitor_id][task]))
590
  match_task_result_details["significant"] = str(match_task_result_details["significant"]).lower() # originaly bool
591
  match_task_result_significant = match_task_result_details["significant"]
 
636
  dataframe = dataframe.style.apply(self._model_tournament_table_highlight_true_and_false, axis=None)
637
  return dataframe
638
 
639
def _correct_significance_in_tournament_results(self, tournament_results, alpha=0.05):
    """Return a copy of ``tournament_results`` with FDR-corrected significance.

    For every submission and every task in ``self.TASKS_METADATA``, the
    p-values of that submission against all other competitors are sent to
    ``correct_pvals_for_fdr``. Each match entry then gets a
    ``"corrected_p_value"`` key, and its ``"significant"`` flag is
    recomputed as ``corrected_p_value < alpha``.

    Args:
        tournament_results: nested mapping indexed as
            ``[submission_id][competitor_id][task]``, where each leaf is a
            dict holding at least ``"p_value"``. It is deep-copied, so the
            caller's object is never modified.
        alpha: significance threshold applied to the corrected p-values. It
            is also forwarded to the correction service as ``fdr_alpha`` so
            both sides use the same level.

    Returns:
        The corrected deep copy of ``tournament_results``.

    Raises:
        CheckSignificanceError: propagated from ``correct_pvals_for_fdr``
            when the correction service rejects a request.
    """
    tournament_results = copy.deepcopy(tournament_results)

    for submission_id, matches in tournament_results.items():
        # The competitor list does not depend on the task, so build it once.
        competitors = [
            competitor_id for competitor_id in matches
            if competitor_id != submission_id  # without self
        ]
        if not competitors:
            # Nothing to correct: avoid one remote request per task with an
            # empty p-value list.
            continue

        for task in self.TASKS_METADATA:
            model_task_pvals = [
                matches[competitor_id][task]["p_value"]
                for competitor_id in competitors
            ]
            # NOTE(review): assumes the service returns the corrected
            # p-values in the same order as they were sent -- confirm.
            corrected_model_task_pvals = correct_pvals_for_fdr(model_task_pvals, fdr_alpha=alpha)
            for competitor_id, task_pval in zip(competitors, corrected_model_task_pvals):
                task_result = matches[competitor_id][task]
                task_result["corrected_p_value"] = task_pval
                task_result["significant"] = bool(task_pval < alpha)

    return tournament_results
652
+
653
  def _dataframe_to_csv(self, dataframe, filename):
654
  try:
655
  if not os.path.isdir(self.DIR_DATAFRAMES_CSV):
 
728
  def _get_leaderboard(self, pre_submit=None, category=None, to_csv=False):
729
  with self.var_lock.ro:
730
  tournament_results = pre_submit.tournament_results if pre_submit else self.tournament_results
731
+ tournament_results = self._correct_significance_in_tournament_results(tournament_results)
732
  category = category if category else self.TASKS_CATEGORY_OVERALL
733
 
734
  if len(tournament_results) == 0: