zhiminy committed on
Commit 686ad29 · 1 Parent(s): 7735526

add bothbad state

Files changed (1)
  1. app.py +69 -45
app.py CHANGED
@@ -1,3 +1,7 @@
+# References for model evaluation metrics:
+# - Chatbot Arena: https://colab.research.google.com/drive/1KdwokPjirkTmpO_P1WByFNFiqxWQquwH
+# - Evalica: https://github.com/dustalov/evalica/blob/master/Chatbot-Arena.ipynb
+
 import dotenv
 import evalica
 import gitlab
@@ -380,6 +384,7 @@ def get_leaderboard_data(vote_entry=None):
             "Conversation Efficiency Index",
             "Model Consistency Score",
             "Average Win Rate",
+            "Average Failure Rate",
             "Bradley-Terry Coefficient",
             "Eigenvector Centrality Value",
             "Newman Modularity Score",
@@ -387,29 +392,6 @@ def get_leaderboard_data(vote_entry=None):
         ]
     )

-    # map vote to winner
-    vote_df["winner"] = vote_df["winner"].map(
-        {
-            "left": evalica.Winner.X,
-            "right": evalica.Winner.Y,
-            "tie": evalica.Winner.Draw,
-        }
-    )
-
-    # Calculate scores using various metrics
-    avr_result = evalica.average_win_rate(
-        vote_df["left"], vote_df["right"], vote_df["winner"]
-    )
-    bt_result = evalica.bradley_terry(
-        vote_df["left"], vote_df["right"], vote_df["winner"]
-    )
-    newman_result = evalica.newman(vote_df["left"], vote_df["right"], vote_df["winner"])
-    eigen_result = evalica.eigen(vote_df["left"], vote_df["right"], vote_df["winner"])
-    elo_result = evalica.elo(vote_df["left"], vote_df["right"], vote_df["winner"])
-    pagerank_result = evalica.pagerank(
-        vote_df["left"], vote_df["right"], vote_df["winner"]
-    )
-
     # Load conversation data from the Hugging Face repository
     conversation_data = load_content_from_hf("SE-Arena/conversations")
     conversation_df = pd.DataFrame(conversation_data)
@@ -427,51 +409,89 @@ def get_leaderboard_data(vote_entry=None):
         left_model = row["left"]
         right_model = row["right"]
         is_self_match = left_model == right_model
-
+
         # Initialize dictionaries for models if they don't exist yet
         for model in [left_model, right_model]:
             if model not in model_stats:
                 model_stats[model] = {
-                    "cei_sum": 0, # Sum of per-round scores
-                    "cei_max": 0, # Sum of per-round maximums
-                    "self_matches": 0, # Count of self-matches
-                    "self_draws": 0 # Count of draws in self-matches
+                    "cei_sum": 0,  # Sum of per-round scores
+                    "cei_max": 0,  # Sum of per-round maximums
+                    "self_matches": 0,  # Count of self-matches
+                    "self_draws": 0,  # Count of draws in self-matches
                 }
-
+
         # Handle self-matches (same model on both sides)
         if is_self_match:
             model_stats[left_model]["self_matches"] += 1
-            if row["winner"] == evalica.Winner.Draw:
+            if row["winner"] == "both_bad" or row["winner"] == "tie":
                 model_stats[left_model]["self_draws"] += 1
             continue
-
+
         # Determine scores based on winner for competitive matches
         match row["winner"]:
-            case evalica.Winner.X:
+            case "left":
                 left_score = 1
                 right_score = -1
-            case evalica.Winner.Y:
+            case "right":
                 left_score = -1
                 right_score = 1
-            case _: # Draw
-                left_score = 0.1
-                right_score = 0.1
-
+            case "tie":
+                left_score = 0.3
+                right_score = 0.3
+            case "both_bad":
+                left_score = -0.3
+                right_score = -0.3
+
         # Count rounds for each side
         left_round = sum(1 for msg in row["left_chat"] if msg["role"] == "assistant")
         right_round = sum(1 for msg in row["right_chat"] if msg["role"] == "assistant")
-
+
         # Update CEI metrics
         model_stats[left_model]["cei_max"] += 1 / left_round
         model_stats[right_model]["cei_max"] += 1 / right_round
         model_stats[left_model]["cei_sum"] += left_score / left_round
         model_stats[right_model]["cei_sum"] += right_score / right_round

+    # map vote to winner
+    vote_df["winner"] = vote_df["winner"].map(
+        {
+            "left": evalica.Winner.X,
+            "right": evalica.Winner.Y,
+            "tie": evalica.Winner.Draw,
+            "both_bad": evalica.Winner.Draw,
+        }
+    )
+
+    # Calculate scores using various metrics
+    avr_result = evalica.average_win_rate(
+        vote_df["left"],
+        vote_df["right"],
+        vote_df["winner"],
+        tie_weight=0,  # Chatbot Arena excludes ties
+    )
+    bt_result = evalica.bradley_terry(
+        vote_df["left"], vote_df["right"], vote_df["winner"], tie_weight=0
+    )
+    newman_result = evalica.newman(
+        vote_df["left"], vote_df["right"], vote_df["winner"], tie_weight=0
+    )
+    eigen_result = evalica.eigen(
+        vote_df["left"], vote_df["right"], vote_df["winner"], tie_weight=0
+    )
+    elo_result = evalica.elo(
+        vote_df["left"], vote_df["right"], vote_df["winner"], tie_weight=0
+    )
+    pagerank_result = evalica.pagerank(
+        vote_df["left"], vote_df["right"], vote_df["winner"], tie_weight=0
+    )
+
     # Calculate CEI results
     cei_result = {}
     for model in elo_result.scores.index:
         if model in model_stats and model_stats[model]["cei_max"] > 0:
-            cei_result[model] = round(model_stats[model]["cei_sum"] / model_stats[model]["cei_max"], 2)
+            cei_result[model] = round(
+                model_stats[model]["cei_sum"] / model_stats[model]["cei_max"], 2
+            )
         else:
             cei_result[model] = "N/A"
     cei_result = pd.Series(cei_result)
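
Note: the hunk above is the heart of the change. The vote-to-evalica.Winner mapping now runs after the per-row CEI loop because that loop matches on the raw vote labels, which is the only way to score "tie" (+0.3 per match) differently from "both_bad" (-0.3 per match); for the ranking metrics both labels still collapse to a draw. A minimal sketch of how one "both_bad" vote flows through both paths (illustration only, not part of the commit; model names and votes are made up, and evalica/pandas are assumed installed):

# Illustration (not part of this commit)
import evalica
import pandas as pd

votes = pd.DataFrame(
    {
        "left": ["model_a", "model_a"],
        "right": ["model_b", "model_b"],
        "winner": ["left", "both_bad"],  # hypothetical vote labels
    }
)

# CEI path: raw labels are still visible here, so "both_bad" can be penalized.
# The dict gives the score the left-hand model receives per vote
# (the right-hand model gets the mirror image, as in the hunk above).
left_score = {"left": 1, "right": -1, "tie": 0.3, "both_bad": -0.3}
print([left_score[w] for w in votes["winner"]])  # [1, -0.3]

# Ranking path: "both_bad" collapses to a draw, exactly like "tie".
winners = votes["winner"].map(
    {
        "left": evalica.Winner.X,
        "right": evalica.Winner.Y,
        "tie": evalica.Winner.Draw,
        "both_bad": evalica.Winner.Draw,
    }
)

# With tie_weight=0, draws (and hence "both_bad") do not count toward win rates.
avr_result = evalica.average_win_rate(votes["left"], votes["right"], winners, tie_weight=0)
print(avr_result.scores)
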
@@ -480,7 +500,9 @@ def get_leaderboard_data(vote_entry=None):
     mcs_result = {}
     for model in elo_result.scores.index:
         if model in model_stats and model_stats[model]["self_matches"] > 0:
-            mcs_result[model] = round(model_stats[model]["self_draws"] / model_stats[model]["self_matches"], 2)
+            mcs_result[model] = round(
+                model_stats[model]["self_draws"] / model_stats[model]["self_matches"], 2
+            )
         else:
             mcs_result[model] = "N/A"
     mcs_result = pd.Series(mcs_result)
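
Note: because the self-match branch earlier now counts "both_bad" as well as "tie" as a draw, the Model Consistency Score treats both outcomes as consistent behavior when a model is paired against itself. A small worked example (illustration only; the vote values are made up):

# Illustration (not part of this commit): MCS for one model that met itself three times.
self_match_votes = ["tie", "both_bad", "left"]  # hypothetical self-match outcomes

self_matches = len(self_match_votes)
self_draws = sum(1 for w in self_match_votes if w in ("tie", "both_bad"))

mcs = round(self_draws / self_matches, 2)
print(mcs)  # 0.67 -- two of the three self-matches were judged a draw
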
@@ -934,10 +956,10 @@ with gr.Blocks() as app:
     # Feedback panel, initially hidden
     with gr.Row(visible=False) as vote_panel:
         feedback = gr.Radio(
-            choices=["Model A", "Model B", "Can't Decide"],
+            choices=["Model A", "Model B", "Tie", "Tie (Both Bad)"],
             label="Which model do you prefer?",
-            value="Can't Decide",
-            interactive=False, # Initially not interactive
+            value="Tie",
+            interactive=False,
         )
         submit_feedback_btn = gr.Button("Submit Feedback", interactive=False)

@@ -1160,8 +1182,10 @@ with gr.Blocks() as app:
                 winner_model = "left"
             case "Model B":
                 winner_model = "right"
-            case "Can't Decide":
+            case "Tie":
                 winner_model = "tie"
+            case _:
+                winner_model = "both_bad"

         # Create feedback entry
         vote_entry = {
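
Note: any radio label other than "Model A", "Model B", or "Tie" (i.e. the new "Tie (Both Bad)" choice) falls through to the wildcard case and is stored as "both_bad". A standalone sketch of that mapping (illustration only; the helper name is hypothetical):

# Illustration (not part of this commit): UI label -> stored vote value.
def label_to_vote(feedback: str) -> str:
    match feedback:
        case "Model A":
            return "left"
        case "Model B":
            return "right"
        case "Tie":
            return "tie"
        case _:  # "Tie (Both Bad)"
            return "both_bad"

print(label_to_vote("Tie (Both Bad)"))  # both_bad
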
@@ -1220,7 +1244,7 @@ with gr.Blocks() as app:
                 value="Submit", interactive=True, visible=True
             ), # [9] Reset send_first button
             gr.update(
-                value="Can't Decide", interactive=True
+                value="Tie", interactive=True
             ), # [10] Reset feedback radio selection
             get_leaderboard_data(vote_entry), # [11] Updated leaderboard data
             gr.update(
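
Note: the last two hunks rename the default choice from "Can't Decide" to "Tie" and reset the radio back to "Tie" after each vote via gr.update. A self-contained sketch of the same four-way radio in isolation (illustration only, not part of the commit; assumes Gradio is installed and the demo variable names are made up):

# Illustration (not part of this commit)
import gradio as gr

with gr.Blocks() as demo:
    feedback = gr.Radio(
        choices=["Model A", "Model B", "Tie", "Tie (Both Bad)"],
        label="Which model do you prefer?",
        value="Tie",
    )
    reset_btn = gr.Button("Reset")
    # The app resets the radio the same way: by returning gr.update(value="Tie").
    reset_btn.click(lambda: gr.update(value="Tie"), outputs=feedback)

# demo.launch()
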
 