Update app.py
Browse files
app.py
CHANGED
@@ -409,7 +409,7 @@ def get_leaderboard_data(vote_entry=None):
|
|
409 |
pagerank_result = evalica.pagerank(
|
410 |
vote_df["left"], vote_df["right"], vote_df["winner"]
|
411 |
)
|
412 |
-
|
413 |
# Load conversation data from the Hugging Face repository
|
414 |
conversation_data = load_content_from_hf("SE-Arena/conversations")
|
415 |
conversation_df = pd.DataFrame(conversation_data)
|
@@ -418,16 +418,16 @@ def get_leaderboard_data(vote_entry=None):
|
|
418 |
all_df = pd.merge(
|
419 |
vote_df, conversation_df, on=["timestamp", "left", "right"], how="inner"
|
420 |
)
|
421 |
-
|
422 |
# Calculate Conversation Efficiency Indexes more efficiently
|
423 |
# Create dictionaries to accumulate per-round scores and inverse round counts for each model
|
424 |
model_rcs_sum = {}
|
425 |
model_rcs_max = {}
|
426 |
-
|
427 |
# Process each row once and accumulate scores
|
428 |
for _, row in all_df.iterrows():
|
429 |
# Determine scores based on winner
|
430 |
-
match row["winner"]:
|
431 |
case evalica.Winner.X:
|
432 |
left_score = 1.0
|
433 |
right_score = -1.0
|
@@ -437,29 +437,45 @@ def get_leaderboard_data(vote_entry=None):
|
|
437 |
case _: # Draw
|
438 |
left_score = 0.1
|
439 |
right_score = 0.1
|
440 |
-
|
441 |
# Count rounds for each side
|
442 |
left_round = sum(1 for msg in row["left_chat"] if msg["role"] == "assistant")
|
443 |
right_round = sum(1 for msg in row["right_chat"] if msg["role"] == "assistant")
|
444 |
-
|
445 |
left_model = row["left"]
|
446 |
right_model = row["right"]
|
447 |
-
|
448 |
model_rcs_max[left_model] = model_rcs_max.get(left_model, 0) + 1.0 / left_round
|
449 |
-
model_rcs_max[right_model] =
|
450 |
-
|
|
|
|
|
451 |
# Calculate per-round scores
|
452 |
-
model_rcs_sum[left_model] =
|
453 |
-
|
|
|
|
|
|
|
|
|
454 |
|
455 |
-
cei_result = {
|
456 |
-
|
|
|
|
|
|
|
|
|
457 |
|
458 |
self_matches = vote_df[vote_df["left"] == vote_df["right"]]
|
459 |
model_matches = self_matches.groupby("left")
|
460 |
-
draw_counts = model_matches["winner"].apply(
|
|
|
|
|
461 |
total_counts = model_matches.size()
|
462 |
-
mcs_result = (
|
|
|
|
|
|
|
|
|
463 |
|
464 |
# Combine all results into a single DataFrame
|
465 |
leaderboard_data = pd.DataFrame(
|
@@ -491,7 +507,7 @@ def get_leaderboard_data(vote_entry=None):
|
|
491 |
|
492 |
# Add a Rank column based on Elo scores
|
493 |
leaderboard_data["Rank"] = (
|
494 |
-
leaderboard_data["Elo Score"].rank(ascending=False).astype(int)
|
495 |
)
|
496 |
|
497 |
# Place rank in the first column
|
|
|
409 |
pagerank_result = evalica.pagerank(
|
410 |
vote_df["left"], vote_df["right"], vote_df["winner"]
|
411 |
)
|
412 |
+
|
413 |
# Load conversation data from the Hugging Face repository
|
414 |
conversation_data = load_content_from_hf("SE-Arena/conversations")
|
415 |
conversation_df = pd.DataFrame(conversation_data)
|
|
|
418 |
all_df = pd.merge(
|
419 |
vote_df, conversation_df, on=["timestamp", "left", "right"], how="inner"
|
420 |
)
|
421 |
+
|
422 |
# Calculate Conversation Efficiency Indexes more efficiently
|
423 |
# Create dictionaries to accumulate per-round scores and inverse round counts for each model
|
424 |
model_rcs_sum = {}
|
425 |
model_rcs_max = {}
|
426 |
+
|
427 |
# Process each row once and accumulate scores
|
428 |
for _, row in all_df.iterrows():
|
429 |
# Determine scores based on winner
|
430 |
+
match row["winner"]:
|
431 |
case evalica.Winner.X:
|
432 |
left_score = 1.0
|
433 |
right_score = -1.0
|
|
|
437 |
case _: # Draw
|
438 |
left_score = 0.1
|
439 |
right_score = 0.1
|
440 |
+
|
441 |
# Count rounds for each side
|
442 |
left_round = sum(1 for msg in row["left_chat"] if msg["role"] == "assistant")
|
443 |
right_round = sum(1 for msg in row["right_chat"] if msg["role"] == "assistant")
|
444 |
+
|
445 |
left_model = row["left"]
|
446 |
right_model = row["right"]
|
447 |
+
|
448 |
model_rcs_max[left_model] = model_rcs_max.get(left_model, 0) + 1.0 / left_round
|
449 |
+
model_rcs_max[right_model] = (
|
450 |
+
model_rcs_max.get(right_model, 0) + 1.0 / right_round
|
451 |
+
)
|
452 |
+
|
453 |
# Calculate per-round scores
|
454 |
+
model_rcs_sum[left_model] = (
|
455 |
+
model_rcs_sum.get(left_model, 0) + left_score / left_round
|
456 |
+
)
|
457 |
+
model_rcs_sum[right_model] = (
|
458 |
+
model_rcs_sum.get(right_model, 0) + right_score / right_round
|
459 |
+
)
|
460 |
|
461 |
+
cei_result = {
|
462 |
+
model: model_rcs_sum[model] / model_rcs_max[model] for model in model_rcs_sum
|
463 |
+
}
|
464 |
+
cei_result = pd.Series(
|
465 |
+
{model: cei_result[model] for model in elo_result.scores.index}
|
466 |
+
)
|
467 |
|
468 |
self_matches = vote_df[vote_df["left"] == vote_df["right"]]
|
469 |
model_matches = self_matches.groupby("left")
|
470 |
+
draw_counts = model_matches["winner"].apply(
|
471 |
+
lambda x: (x == evalica.Winner.Draw).sum()
|
472 |
+
)
|
473 |
total_counts = model_matches.size()
|
474 |
+
mcs_result = (
|
475 |
+
(draw_counts / total_counts)
|
476 |
+
.round(2)
|
477 |
+
.reindex(elo_result.scores.index, fill_value="N/A")
|
478 |
+
)
|
479 |
|
480 |
# Combine all results into a single DataFrame
|
481 |
leaderboard_data = pd.DataFrame(
|
|
|
507 |
|
508 |
# Add a Rank column based on Elo scores
|
509 |
leaderboard_data["Rank"] = (
|
510 |
+
leaderboard_data["Elo Score"].rank(method="min", ascending=False).astype(int)
|
511 |
)
|
512 |
|
513 |
# Place rank in the first column
|