Update app.py
Browse files
app.py
CHANGED
@@ -419,63 +419,71 @@ def get_leaderboard_data(vote_entry=None):
|
|
419 |
vote_df, conversation_df, on=["timestamp", "left", "right"], how="inner"
|
420 |
)
|
421 |
|
422 |
-
#
|
423 |
-
|
424 |
-
model_rcs_sum = {}
|
425 |
-
model_rcs_max = {}
|
426 |
|
427 |
# Process each row once and accumulate scores
|
428 |
for _, row in all_df.iterrows():
|
429 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
430 |
match row["winner"]:
|
431 |
case evalica.Winner.X:
|
432 |
-
left_score = 1
|
433 |
-
right_score = -1
|
434 |
case evalica.Winner.Y:
|
435 |
-
left_score = -1
|
436 |
-
right_score = 1
|
437 |
case _: # Draw
|
438 |
left_score = 0.1
|
439 |
right_score = 0.1
|
440 |
-
|
441 |
# Count rounds for each side
|
442 |
left_round = sum(1 for msg in row["left_chat"] if msg["role"] == "assistant")
|
443 |
right_round = sum(1 for msg in row["right_chat"] if msg["role"] == "assistant")
|
444 |
-
|
445 |
-
|
446 |
-
|
447 |
-
|
448 |
-
|
449 |
-
|
450 |
-
|
451 |
-
|
452 |
-
|
453 |
-
|
454 |
-
|
455 |
-
|
456 |
-
|
457 |
-
|
458 |
-
|
459 |
-
|
460 |
-
|
461 |
-
|
462 |
-
|
463 |
-
|
464 |
-
|
465 |
-
|
466 |
-
|
467 |
-
|
468 |
-
self_matches = vote_df[vote_df["left"] == vote_df["right"]]
|
469 |
-
model_matches = self_matches.groupby("left")
|
470 |
-
draw_counts = model_matches["winner"].apply(
|
471 |
-
lambda x: (x == evalica.Winner.Draw).sum()
|
472 |
-
)
|
473 |
-
total_counts = model_matches.size()
|
474 |
-
mcs_result = (
|
475 |
-
(draw_counts / total_counts)
|
476 |
-
.round(2)
|
477 |
-
.reindex(elo_result.scores.index, fill_value="N/A")
|
478 |
-
)
|
479 |
|
480 |
# Combine all results into a single DataFrame
|
481 |
leaderboard_data = pd.DataFrame(
|
@@ -496,7 +504,6 @@ def get_leaderboard_data(vote_entry=None):
|
|
496 |
leaderboard_data = leaderboard_data.round(
|
497 |
{
|
498 |
"Elo Score": 2,
|
499 |
-
"Conversation Efficiency Index": 2,
|
500 |
"Average Win Rate": 2,
|
501 |
"Bradley-Terry Coefficient": 2,
|
502 |
"Eigenvector Centrality Value": 2,
|
|
|
419 |
vote_df, conversation_df, on=["timestamp", "left", "right"], how="inner"
|
420 |
)
|
421 |
|
422 |
+
# Create a single dictionary to track per-model CEI sums and self-match counts
|
423 |
+
model_stats = {}
|
|
|
|
|
424 |
|
425 |
# Process each row once and accumulate scores
|
426 |
for _, row in all_df.iterrows():
|
427 |
+
left_model = row["left"]
|
428 |
+
right_model = row["right"]
|
429 |
+
is_self_match = left_model == right_model
|
430 |
+
|
431 |
+
# Initialize dictionaries for models if they don't exist yet
|
432 |
+
for model in [left_model, right_model]:
|
433 |
+
if model not in model_stats:
|
434 |
+
model_stats[model] = {
|
435 |
+
"cei_sum": 0, # Sum of per-round scores
|
436 |
+
"cei_max": 0, # Sum of per-round maximums
|
437 |
+
"self_matches": 0, # Count of self-matches
|
438 |
+
"self_draws": 0 # Count of draws in self-matches
|
439 |
+
}
|
440 |
+
|
441 |
+
# Handle self-matches (same model on both sides)
|
442 |
+
if is_self_match:
|
443 |
+
model_stats[left_model]["self_matches"] += 1
|
444 |
+
if row["winner"] == evalica.Winner.Draw:
|
445 |
+
model_stats[left_model]["self_draws"] += 1
|
446 |
+
continue
|
447 |
+
|
448 |
+
# Determine scores based on winner for competitive matches
|
449 |
match row["winner"]:
|
450 |
case evalica.Winner.X:
|
451 |
+
left_score = 1
|
452 |
+
right_score = -1
|
453 |
case evalica.Winner.Y:
|
454 |
+
left_score = -1
|
455 |
+
right_score = 1
|
456 |
case _: # Draw
|
457 |
left_score = 0.1
|
458 |
right_score = 0.1
|
459 |
+
|
460 |
# Count rounds for each side
|
461 |
left_round = sum(1 for msg in row["left_chat"] if msg["role"] == "assistant")
|
462 |
right_round = sum(1 for msg in row["right_chat"] if msg["role"] == "assistant")
|
463 |
+
|
464 |
+
# Update CEI metrics
|
465 |
+
model_stats[left_model]["cei_max"] += 1 / left_round
|
466 |
+
model_stats[right_model]["cei_max"] += 1 / right_round
|
467 |
+
model_stats[left_model]["cei_sum"] += left_score / left_round
|
468 |
+
model_stats[right_model]["cei_sum"] += right_score / right_round
|
469 |
+
|
470 |
+
# Calculate CEI results
|
471 |
+
cei_result = {}
|
472 |
+
for model in elo_result.scores.index:
|
473 |
+
if model in model_stats and model_stats[model]["cei_max"] > 0:
|
474 |
+
cei_result[model] = round(model_stats[model]["cei_sum"] / model_stats[model]["cei_max"], 2)
|
475 |
+
else:
|
476 |
+
cei_result[model] = "N/A"
|
477 |
+
cei_result = pd.Series(cei_result)
|
478 |
+
|
479 |
+
# Calculate MCS results
|
480 |
+
mcs_result = {}
|
481 |
+
for model in elo_result.scores.index:
|
482 |
+
if model in model_stats and model_stats[model]["self_matches"] > 0:
|
483 |
+
mcs_result[model] = round(model_stats[model]["self_draws"] / model_stats[model]["self_matches"], 2)
|
484 |
+
else:
|
485 |
+
mcs_result[model] = "N/A"
|
486 |
+
mcs_result = pd.Series(mcs_result)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
487 |
|
488 |
# Combine all results into a single DataFrame
|
489 |
leaderboard_data = pd.DataFrame(
|
|
|
504 |
leaderboard_data = leaderboard_data.round(
|
505 |
{
|
506 |
"Elo Score": 2,
|
|
|
507 |
"Average Win Rate": 2,
|
508 |
"Bradley-Terry Coefficient": 2,
|
509 |
"Eigenvector Centrality Value": 2,
|