add instability score
Browse files
app.py
CHANGED
@@ -366,6 +366,7 @@ def get_leaderboard_data(feedback_entry=None):
|
|
366 |
"Rank",
|
367 |
"Model",
|
368 |
"Elo Score",
|
|
|
369 |
"Average Win Rate",
|
370 |
"Bradley-Terry Coefficient",
|
371 |
"Eigenvector Centrality Value",
|
@@ -402,12 +403,31 @@ def get_leaderboard_data(feedback_entry=None):
|
|
402 |
pagerank_result = evalica.pagerank(
|
403 |
feedback_df["left"], feedback_df["right"], feedback_df["winner"]
|
404 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
405 |
|
406 |
# Combine all results into a single DataFrame
|
407 |
leaderboard_data = pd.DataFrame(
|
408 |
{
|
409 |
"Model": elo_result.scores.index,
|
410 |
"Elo Score": elo_result.scores.values,
|
|
|
411 |
"Average Win Rate": avr_result.scores.values * 100,
|
412 |
"Bradley-Terry Coefficient": bt_result.scores.values,
|
413 |
"Eigenvector Centrality Value": eigen_result.scores.values,
|
@@ -420,6 +440,7 @@ def get_leaderboard_data(feedback_entry=None):
|
|
420 |
leaderboard_data = leaderboard_data.round(
|
421 |
{
|
422 |
"Elo Score": 2,
|
|
|
423 |
"Average Win Rate": 2,
|
424 |
"Bradley-Terry Coefficient": 2,
|
425 |
"Eigenvector Centrality Value": 2,
|
@@ -471,11 +492,12 @@ with gr.Blocks() as app:
|
|
471 |
"Rank",
|
472 |
"Model",
|
473 |
"Elo Score",
|
474 |
-
"
|
475 |
],
|
476 |
search_columns=["Model"],
|
477 |
filter_columns=[
|
478 |
"Elo Score",
|
|
|
479 |
"Average Win Rate",
|
480 |
"Bradley-Terry Coefficient",
|
481 |
"Eigenvector Centrality Value",
|
|
|
366 |
"Rank",
|
367 |
"Model",
|
368 |
"Elo Score",
|
369 |
+
"Instability Score",
|
370 |
"Average Win Rate",
|
371 |
"Bradley-Terry Coefficient",
|
372 |
"Eigenvector Centrality Value",
|
|
|
403 |
pagerank_result = evalica.pagerank(
|
404 |
feedback_df["left"], feedback_df["right"], feedback_df["winner"]
|
405 |
)
|
406 |
+
|
407 |
+
# Calculate instability score as a pandas Series aligned with other metrics
|
408 |
+
is_result = pd.Series(0.0, index=elo_result.scores.index) # Initialize with zeros using same index
|
409 |
+
|
410 |
+
# Loop through models and update values
|
411 |
+
for model in is_result.index:
|
412 |
+
# Filter self-matches for this model
|
413 |
+
self_matches = feedback_df[
|
414 |
+
(feedback_df["left"] == model) &
|
415 |
+
(feedback_df["right"] == model)
|
416 |
+
]
|
417 |
+
total = len(self_matches)
|
418 |
+
|
419 |
+
if total:
|
420 |
+
# Count non-draw outcomes (wins or losses)
|
421 |
+
non_draws = self_matches[self_matches["winner"] != evalica.Winner.Draw].shape[0]
|
422 |
+
# Store as a fraction in [0, 1]; it is scaled to a percentage (* 100) when the leaderboard DataFrame is built
|
423 |
+
is_result[model] = non_draws / total
|
424 |
|
425 |
# Combine all results into a single DataFrame
|
426 |
leaderboard_data = pd.DataFrame(
|
427 |
{
|
428 |
"Model": elo_result.scores.index,
|
429 |
"Elo Score": elo_result.scores.values,
|
430 |
+
"Instability Score": is_result.values * 100,
|
431 |
"Average Win Rate": avr_result.scores.values * 100,
|
432 |
"Bradley-Terry Coefficient": bt_result.scores.values,
|
433 |
"Eigenvector Centrality Value": eigen_result.scores.values,
|
|
|
440 |
leaderboard_data = leaderboard_data.round(
|
441 |
{
|
442 |
"Elo Score": 2,
|
443 |
+
"Instability Score": 2,
|
444 |
"Average Win Rate": 2,
|
445 |
"Bradley-Terry Coefficient": 2,
|
446 |
"Eigenvector Centrality Value": 2,
|
|
|
492 |
"Rank",
|
493 |
"Model",
|
494 |
"Elo Score",
|
495 |
+
"Instability Score",
|
496 |
],
|
497 |
search_columns=["Model"],
|
498 |
filter_columns=[
|
499 |
"Elo Score",
|
500 |
+
"Instability Score",
|
501 |
"Average Win Rate",
|
502 |
"Bradley-Terry Coefficient",
|
503 |
"Eigenvector Centrality Value",
|