add both_bad state
app.py
CHANGED
@@ -1,3 +1,7 @@
+# References for model evaluation metrics:
+# - Chatbot Arena: https://colab.research.google.com/drive/1KdwokPjirkTmpO_P1WByFNFiqxWQquwH
+# - Evalica: https://github.com/dustalov/evalica/blob/master/Chatbot-Arena.ipynb
+
 import dotenv
 import evalica
 import gitlab
@@ -380,6 +384,7 @@ def get_leaderboard_data(vote_entry=None):
             "Conversation Efficiency Index",
             "Model Consistency Score",
             "Average Win Rate",
+            "Average Failure Rate",
             "Bradley-Terry Coefficient",
             "Eigenvector Centrality Value",
             "Newman Modularity Score",
@@ -387,29 +392,6 @@ def get_leaderboard_data(vote_entry=None):
         ]
     )

-    # map vote to winner
-    vote_df["winner"] = vote_df["winner"].map(
-        {
-            "left": evalica.Winner.X,
-            "right": evalica.Winner.Y,
-            "tie": evalica.Winner.Draw,
-        }
-    )
-
-    # Calculate scores using various metrics
-    avr_result = evalica.average_win_rate(
-        vote_df["left"], vote_df["right"], vote_df["winner"]
-    )
-    bt_result = evalica.bradley_terry(
-        vote_df["left"], vote_df["right"], vote_df["winner"]
-    )
-    newman_result = evalica.newman(vote_df["left"], vote_df["right"], vote_df["winner"])
-    eigen_result = evalica.eigen(vote_df["left"], vote_df["right"], vote_df["winner"])
-    elo_result = evalica.elo(vote_df["left"], vote_df["right"], vote_df["winner"])
-    pagerank_result = evalica.pagerank(
-        vote_df["left"], vote_df["right"], vote_df["winner"]
-    )
-
     # Load conversation data from the Hugging Face repository
     conversation_data = load_content_from_hf("SE-Arena/conversations")
     conversation_df = pd.DataFrame(conversation_data)
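
Note: the ranking block removed here is re-added after the per-conversation loop (see the next hunk), extended with a "both_bad" mapping and tie_weight=0. A minimal sketch of that usage on invented votes, assuming only the evalica API already used in this file (Winner.X/Y/Draw, the pairwise ranking functions, and the .scores attribute on their results):

    import evalica
    import pandas as pd

    # Toy votes: "both_bad" collapses to a draw, exactly like "tie".
    vote_df = pd.DataFrame(
        {
            "left": ["model_a", "model_a", "model_b"],
            "right": ["model_b", "model_c", "model_c"],
            "winner": ["left", "both_bad", "tie"],
        }
    )
    vote_df["winner"] = vote_df["winner"].map(
        {
            "left": evalica.Winner.X,
            "right": evalica.Winner.Y,
            "tie": evalica.Winner.Draw,
            "both_bad": evalica.Winner.Draw,
        }
    )

    # tie_weight=0 excludes draws from the win counting, per the
    # "Chatbot Arena excludes ties" comment in the next hunk.
    avr_result = evalica.average_win_rate(
        vote_df["left"], vote_df["right"], vote_df["winner"], tie_weight=0
    )
    bt_result = evalica.bradley_terry(
        vote_df["left"], vote_df["right"], vote_df["winner"], tie_weight=0
    )
    print(avr_result.scores)
    print(bt_result.scores)
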
@@ -427,51 +409,89 @@ def get_leaderboard_data(vote_entry=None):
         left_model = row["left"]
         right_model = row["right"]
         is_self_match = left_model == right_model
-
+
         # Initialize dictionaries for models if they don't exist yet
         for model in [left_model, right_model]:
             if model not in model_stats:
                 model_stats[model] = {
-                    "cei_sum": 0,
-                    "cei_max": 0,
-                    "self_matches": 0,
-                    "self_draws": 0
+                    "cei_sum": 0,  # Sum of per-round scores
+                    "cei_max": 0,  # Sum of per-round maximums
+                    "self_matches": 0,  # Count of self-matches
+                    "self_draws": 0,  # Count of draws in self-matches
                 }
-
+
         # Handle self-matches (same model on both sides)
         if is_self_match:
             model_stats[left_model]["self_matches"] += 1
-            if row["winner"] ==
+            if row["winner"] == "both_bad" or row["winner"] == "tie":
                 model_stats[left_model]["self_draws"] += 1
             continue
-
+
         # Determine scores based on winner for competitive matches
         match row["winner"]:
-            case
+            case "left":
                 left_score = 1
                 right_score = -1
-            case
+            case "right":
                 left_score = -1
                 right_score = 1
-            case
-                left_score = 0.
-                right_score = 0.
-
+            case "tie":
+                left_score = 0.3
+                right_score = 0.3
+            case "both_bad":
+                left_score = -0.3
+                right_score = -0.3
+
         # Count rounds for each side
         left_round = sum(1 for msg in row["left_chat"] if msg["role"] == "assistant")
         right_round = sum(1 for msg in row["right_chat"] if msg["role"] == "assistant")
-
+
         # Update CEI metrics
         model_stats[left_model]["cei_max"] += 1 / left_round
         model_stats[right_model]["cei_max"] += 1 / right_round
         model_stats[left_model]["cei_sum"] += left_score / left_round
         model_stats[right_model]["cei_sum"] += right_score / right_round

+    # map vote to winner
+    vote_df["winner"] = vote_df["winner"].map(
+        {
+            "left": evalica.Winner.X,
+            "right": evalica.Winner.Y,
+            "tie": evalica.Winner.Draw,
+            "both_bad": evalica.Winner.Draw,
+        }
+    )
+
+    # Calculate scores using various metrics
+    avr_result = evalica.average_win_rate(
+        vote_df["left"],
+        vote_df["right"],
+        vote_df["winner"],
+        tie_weight=0,  # Chatbot Arena excludes ties
+    )
+    bt_result = evalica.bradley_terry(
+        vote_df["left"], vote_df["right"], vote_df["winner"], tie_weight=0
+    )
+    newman_result = evalica.newman(
+        vote_df["left"], vote_df["right"], vote_df["winner"], tie_weight=0
+    )
+    eigen_result = evalica.eigen(
+        vote_df["left"], vote_df["right"], vote_df["winner"], tie_weight=0
+    )
+    elo_result = evalica.elo(
+        vote_df["left"], vote_df["right"], vote_df["winner"], tie_weight=0
+    )
+    pagerank_result = evalica.pagerank(
+        vote_df["left"], vote_df["right"], vote_df["winner"], tie_weight=0
+    )
+
     # Calculate CEI results
     cei_result = {}
     for model in elo_result.scores.index:
         if model in model_stats and model_stats[model]["cei_max"] > 0:
-            cei_result[model] = round(
+            cei_result[model] = round(
+                model_stats[model]["cei_sum"] / model_stats[model]["cei_max"], 2
+            )
         else:
             cei_result[model] = "N/A"
     cei_result = pd.Series(cei_result)
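
Note: with the scores assigned above, a model's Conversation Efficiency Index is cei_sum / cei_max, i.e. its round-weighted score divided by the best score it could have reached over the same conversations (fewer assistant rounds weigh each outcome more heavily). A small worked sketch with invented outcomes; the helper name is hypothetical:

    def conversation_efficiency_index(outcomes):
        """outcomes: (score, assistant_rounds) pairs for one model."""
        cei_sum = sum(score / rounds for score, rounds in outcomes)
        cei_max = sum(1 / rounds for _, rounds in outcomes)
        return round(cei_sum / cei_max, 2) if cei_max > 0 else "N/A"

    # A win in 2 rounds, a "both_bad" (-0.3) in 3 rounds, a loss in 1 round:
    # cei_sum = 1/2 - 0.3/3 - 1/1 = -0.6; cei_max = 1/2 + 1/3 + 1/1 ≈ 1.83
    print(conversation_efficiency_index([(1, 2), (-0.3, 3), (-1, 1)]))  # -0.33
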
@@ -480,7 +500,9 @@ def get_leaderboard_data(vote_entry=None):
     mcs_result = {}
     for model in elo_result.scores.index:
         if model in model_stats and model_stats[model]["self_matches"] > 0:
-            mcs_result[model] = round(
+            mcs_result[model] = round(
+                model_stats[model]["self_draws"] / model_stats[model]["self_matches"], 2
+            )
         else:
             mcs_result[model] = "N/A"
     mcs_result = pd.Series(mcs_result)
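
Note: the Model Consistency Score is the share of a model's self-matches (the same model on both sides) that the voter judged a draw; with this change both "tie" and "both_bad" count as draws. A short sketch with invented counts:

    # Hypothetical counts: 5 self-matches, 4 of them voted "tie" or "both_bad".
    self_matches, self_draws = 5, 4
    mcs = round(self_draws / self_matches, 2) if self_matches > 0 else "N/A"
    print(mcs)  # 0.8
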
@@ -934,10 +956,10 @@ with gr.Blocks() as app:
     # Feedback panel, initially hidden
     with gr.Row(visible=False) as vote_panel:
         feedback = gr.Radio(
-            choices=["Model A", "Model B", "
+            choices=["Model A", "Model B", "Tie", "Tie (Both Bad)"],
             label="Which model do you prefer?",
-            value="
-            interactive=False,
+            value="Tie",
+            interactive=False,
         )
         submit_feedback_btn = gr.Button("Submit Feedback", interactive=False)

@@ -1160,8 +1182,10 @@ with gr.Blocks() as app:
                 winner_model = "left"
             case "Model B":
                 winner_model = "right"
-            case "
+            case "Tie":
                 winner_model = "tie"
+            case _:
+                winner_model = "both_bad"

         # Create feedback entry
         vote_entry = {
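
Note: the four radio labels now map onto four stored vote states, with "Tie (Both Bad)" falling through to the wildcard case. A standalone sketch of the same mapping; the function name is hypothetical:

    def feedback_to_winner(feedback: str) -> str:
        """Map the gr.Radio label to the winner value stored in the vote entry."""
        match feedback:
            case "Model A":
                return "left"
            case "Model B":
                return "right"
            case "Tie":
                return "tie"
            case _:  # "Tie (Both Bad)"
                return "both_bad"

    assert feedback_to_winner("Tie (Both Bad)") == "both_bad"
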
@@ -1220,7 +1244,7 @@ with gr.Blocks() as app:
             value="Submit", interactive=True, visible=True
         ),  # [9] Reset send_first button
         gr.update(
-            value="
+            value="Tie", interactive=True
         ),  # [10] Reset feedback radio selection
         get_leaderboard_data(vote_entry),  # [11] Updated leaderboard data
         gr.update(