Spaces:
Running
Running
Correcting significance in `tournament_results` with FDR correction method
Browse files
server.py
CHANGED
@@ -167,6 +167,30 @@ def check_significance(model_a_path, model_b_path):
|
|
167 |
result = check_significance_wait_for_result(result_url)
|
168 |
return result
|
169 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
170 |
class NoneLock:
|
171 |
def __init__(self, *args, **kwargs):
|
172 |
pass
|
@@ -543,6 +567,7 @@ class LeaderboardServer:
|
|
543 |
|
544 |
with self.var_lock.ro:
|
545 |
tournament_results = pre_submit.tournament_results if pre_submit else self.tournament_results
|
|
|
546 |
|
547 |
for competitor_id in tournament_results[submission_id].keys() - {submission_id}: # without self
|
548 |
if competitor_id not in self.submission_id_to_data:
|
@@ -560,7 +585,7 @@ class LeaderboardServer:
|
|
560 |
if to_csv:
|
561 |
match_results[task] = tournament_results[submission_id][competitor_id][task]["significant"]
|
562 |
else:
|
563 |
-
match_task_result_details = dict.fromkeys(["significant", "p_value"]) # order has impact to sorting DataFrame
|
564 |
match_task_result_details.update(copy.deepcopy(tournament_results[submission_id][competitor_id][task]))
|
565 |
match_task_result_details["significant"] = str(match_task_result_details["significant"]).lower() # originaly bool
|
566 |
match_task_result_significant = match_task_result_details["significant"]
|
@@ -611,6 +636,20 @@ class LeaderboardServer:
|
|
611 |
dataframe = dataframe.style.apply(self._model_tournament_table_highlight_true_and_false, axis=None)
|
612 |
return dataframe
|
613 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
614 |
def _dataframe_to_csv(self, dataframe, filename):
|
615 |
try:
|
616 |
if not os.path.isdir(self.DIR_DATAFRAMES_CSV):
|
@@ -689,6 +728,7 @@ class LeaderboardServer:
|
|
689 |
def _get_leaderboard(self, pre_submit=None, category=None, to_csv=False):
|
690 |
with self.var_lock.ro:
|
691 |
tournament_results = pre_submit.tournament_results if pre_submit else self.tournament_results
|
|
|
692 |
category = category if category else self.TASKS_CATEGORY_OVERALL
|
693 |
|
694 |
if len(tournament_results) == 0:
|
|
|
167 |
result = check_significance_wait_for_result(result_url)
|
168 |
return result
|
169 |
|
170 |
+
def correct_pvals_for_fdr(model_task_pvals, fdr_alpha=0.05, repeat_on_conn_timeout=10):
    """Send p-values to the remote compare-significance service for FDR correction.

    Args:
        model_task_pvals: p-values to correct; sent as the ``"pvals"`` field
            of the JSON request body.
        fdr_alpha: value sent as the ``"fdr_alpha"`` field of the request.
        repeat_on_conn_timeout: forwarded as the first argument of
            ``check_significance_repeat_on_conn_timeout`` (presumably the
            number of retries on connection timeout -- confirm in the helper).

    Returns:
        The decoded JSON body of a 200 response. NOTE(review): callers zip it
        with the input list, so it is presumably a list of corrected p-values
        in input order -- confirm against the service.

    Raises:
        CheckSignificanceError: when the service answers with HTTP 429 or any
            other non-200 status code.
    """
    endpoint = 'https://czechllm.fit.vutbr.cz/benczechmark-leaderboard/compare_significance/correct_pvals_for_fdr'
    payload = {"pvals": model_task_pvals, "fdr_alpha": fdr_alpha}

    # The POST is issued through the shared helper rather than directly.
    response = check_significance_repeat_on_conn_timeout(
        repeat_on_conn_timeout,
        requests.post,
        endpoint,
        json=payload,
        timeout=60 * 5,
    )

    # Guard clauses: reject every non-200 answer, then decode the body.
    status = response.status_code
    if status == 429:
        raise CheckSignificanceError('Server is too busy. Please try again later.')
    if status != 200:
        raise CheckSignificanceError(f'Failed to submit task. Status code: {response.status_code}')

    return response.json()
|
193 |
+
|
194 |
class NoneLock:
|
195 |
def __init__(self, *args, **kwargs):
|
196 |
pass
|
|
|
567 |
|
568 |
with self.var_lock.ro:
|
569 |
tournament_results = pre_submit.tournament_results if pre_submit else self.tournament_results
|
570 |
+
tournament_results = self._correct_significance_in_tournament_results(tournament_results)
|
571 |
|
572 |
for competitor_id in tournament_results[submission_id].keys() - {submission_id}: # without self
|
573 |
if competitor_id not in self.submission_id_to_data:
|
|
|
585 |
if to_csv:
|
586 |
match_results[task] = tournament_results[submission_id][competitor_id][task]["significant"]
|
587 |
else:
|
588 |
+
match_task_result_details = dict.fromkeys(["significant", "corrected_p_value", "p_value"]) # order has impact to sorting DataFrame
|
589 |
match_task_result_details.update(copy.deepcopy(tournament_results[submission_id][competitor_id][task]))
|
590 |
match_task_result_details["significant"] = str(match_task_result_details["significant"]).lower() # originaly bool
|
591 |
match_task_result_significant = match_task_result_details["significant"]
|
|
|
636 |
dataframe = dataframe.style.apply(self._model_tournament_table_highlight_true_and_false, axis=None)
|
637 |
return dataframe
|
638 |
|
639 |
+
def _correct_significance_in_tournament_results(self, tournament_results, alpha=0.05):
    """Return a deep copy of ``tournament_results`` with FDR-corrected significance.

    For every submission and every task in ``self.TASKS_METADATA``, the raw
    ``"p_value"`` entries against all competitors (excluding the submission
    itself) are sent to ``correct_pvals_for_fdr``. Each match/task entry in
    the copy then gets a ``"corrected_p_value"`` key and its ``"significant"``
    flag is overwritten with ``corrected_p_value < alpha``.

    Args:
        tournament_results: nested mapping indexed as
            ``[submission_id][competitor_id][task]``; each leaf is a dict
            holding at least ``"p_value"``. The input is not modified.
        alpha: significance threshold. Used for the local comparison and
            forwarded to the correction service as ``fdr_alpha`` so the two
            cannot diverge.

    Returns:
        The corrected deep copy of ``tournament_results``.

    Raises:
        CheckSignificanceError: propagated from ``correct_pvals_for_fdr``
            when the correction service rejects the request.
    """
    tournament_results = copy.deepcopy(tournament_results)

    for submission_id, matches in tournament_results.items():
        # The competitor list does not depend on the task, so build it once
        # per submission (without self).
        competitors = [competitor_id for competitor_id in matches if competitor_id != submission_id]
        if not competitors:
            # Nothing to correct; avoid a pointless remote call per task.
            continue

        for task in self.TASKS_METADATA:
            model_task_pvals = [matches[competitor_id][task]["p_value"] for competitor_id in competitors]
            # Forward alpha so the service and the local threshold agree.
            corrected_model_task_pvals = correct_pvals_for_fdr(model_task_pvals, fdr_alpha=alpha)

            for competitor_id, task_pval in zip(competitors, corrected_model_task_pvals):
                task_result = matches[competitor_id][task]
                task_result["corrected_p_value"] = task_pval
                task_result["significant"] = bool(task_pval < alpha)

    return tournament_results
|
652 |
+
|
653 |
def _dataframe_to_csv(self, dataframe, filename):
|
654 |
try:
|
655 |
if not os.path.isdir(self.DIR_DATAFRAMES_CSV):
|
|
|
728 |
def _get_leaderboard(self, pre_submit=None, category=None, to_csv=False):
|
729 |
with self.var_lock.ro:
|
730 |
tournament_results = pre_submit.tournament_results if pre_submit else self.tournament_results
|
731 |
+
tournament_results = self._correct_significance_in_tournament_results(tournament_results)
|
732 |
category = category if category else self.TASKS_CATEGORY_OVERALL
|
733 |
|
734 |
if len(tournament_results) == 0:
|