zhiminy committed on
Commit
7735526
·
1 Parent(s): 1da2cba

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +53 -46
app.py CHANGED
@@ -419,63 +419,71 @@ def get_leaderboard_data(vote_entry=None):
419
  vote_df, conversation_df, on=["timestamp", "left", "right"], how="inner"
420
  )
421
 
422
- # Calculate Conversation Efficiency Indexs more efficiently
423
- # Create a dictionary to store accumulated scores and counts for each model
424
- model_rcs_sum = {}
425
- model_rcs_max = {}
426
 
427
  # Process each row once and accumulate scores
428
  for _, row in all_df.iterrows():
429
- # Determine scores based on winner
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
430
  match row["winner"]:
431
  case evalica.Winner.X:
432
- left_score = 1.0
433
- right_score = -1.0
434
  case evalica.Winner.Y:
435
- left_score = -1.0
436
- right_score = 1.0
437
  case _: # Draw
438
  left_score = 0.1
439
  right_score = 0.1
440
-
441
  # Count rounds for each side
442
  left_round = sum(1 for msg in row["left_chat"] if msg["role"] == "assistant")
443
  right_round = sum(1 for msg in row["right_chat"] if msg["role"] == "assistant")
444
-
445
- left_model = row["left"]
446
- right_model = row["right"]
447
-
448
- model_rcs_max[left_model] = model_rcs_max.get(left_model, 0) + 1.0 / left_round
449
- model_rcs_max[right_model] = (
450
- model_rcs_max.get(right_model, 0) + 1.0 / right_round
451
- )
452
-
453
- # Calculate per-round scores
454
- model_rcs_sum[left_model] = (
455
- model_rcs_sum.get(left_model, 0) + left_score / left_round
456
- )
457
- model_rcs_sum[right_model] = (
458
- model_rcs_sum.get(right_model, 0) + right_score / right_round
459
- )
460
-
461
- cei_result = {
462
- model: model_rcs_sum[model] / model_rcs_max[model] for model in model_rcs_sum
463
- }
464
- cei_result = pd.Series(
465
- {model: cei_result[model] for model in elo_result.scores.index}
466
- )
467
-
468
- self_matches = vote_df[vote_df["left"] == vote_df["right"]]
469
- model_matches = self_matches.groupby("left")
470
- draw_counts = model_matches["winner"].apply(
471
- lambda x: (x == evalica.Winner.Draw).sum()
472
- )
473
- total_counts = model_matches.size()
474
- mcs_result = (
475
- (draw_counts / total_counts)
476
- .round(2)
477
- .reindex(elo_result.scores.index, fill_value="N/A")
478
- )
479
 
480
  # Combine all results into a single DataFrame
481
  leaderboard_data = pd.DataFrame(
@@ -496,7 +504,6 @@ def get_leaderboard_data(vote_entry=None):
496
  leaderboard_data = leaderboard_data.round(
497
  {
498
  "Elo Score": 2,
499
- "Conversation Efficiency Index": 2,
500
  "Average Win Rate": 2,
501
  "Bradley-Terry Coefficient": 2,
502
  "Eigenvector Centrality Value": 2,
 
419
  vote_df, conversation_df, on=["timestamp", "left", "right"], how="inner"
420
  )
421
 
422
+ # Create dictionaries to track scores and match counts
423
+ model_stats = {}
 
 
424
 
425
  # Process each row once and accumulate scores
426
  for _, row in all_df.iterrows():
427
+ left_model = row["left"]
428
+ right_model = row["right"]
429
+ is_self_match = left_model == right_model
430
+
431
+ # Initialize dictionaries for models if they don't exist yet
432
+ for model in [left_model, right_model]:
433
+ if model not in model_stats:
434
+ model_stats[model] = {
435
+ "cei_sum": 0, # Sum of per-round scores
436
+ "cei_max": 0, # Sum of per-round maximums
437
+ "self_matches": 0, # Count of self-matches
438
+ "self_draws": 0 # Count of draws in self-matches
439
+ }
440
+
441
+ # Handle self-matches (same model on both sides)
442
+ if is_self_match:
443
+ model_stats[left_model]["self_matches"] += 1
444
+ if row["winner"] == evalica.Winner.Draw:
445
+ model_stats[left_model]["self_draws"] += 1
446
+ continue
447
+
448
+ # Determine scores based on winner for competitive matches
449
  match row["winner"]:
450
  case evalica.Winner.X:
451
+ left_score = 1
452
+ right_score = -1
453
  case evalica.Winner.Y:
454
+ left_score = -1
455
+ right_score = 1
456
  case _: # Draw
457
  left_score = 0.1
458
  right_score = 0.1
459
+
460
  # Count rounds for each side
461
  left_round = sum(1 for msg in row["left_chat"] if msg["role"] == "assistant")
462
  right_round = sum(1 for msg in row["right_chat"] if msg["role"] == "assistant")
463
+
464
+ # Update CEI metrics
465
+ model_stats[left_model]["cei_max"] += 1 / left_round
466
+ model_stats[right_model]["cei_max"] += 1 / right_round
467
+ model_stats[left_model]["cei_sum"] += left_score / left_round
468
+ model_stats[right_model]["cei_sum"] += right_score / right_round
469
+
470
+ # Calculate CEI results
471
+ cei_result = {}
472
+ for model in elo_result.scores.index:
473
+ if model in model_stats and model_stats[model]["cei_max"] > 0:
474
+ cei_result[model] = round(model_stats[model]["cei_sum"] / model_stats[model]["cei_max"], 2)
475
+ else:
476
+ cei_result[model] = "N/A"
477
+ cei_result = pd.Series(cei_result)
478
+
479
+ # Calculate MCS results
480
+ mcs_result = {}
481
+ for model in elo_result.scores.index:
482
+ if model in model_stats and model_stats[model]["self_matches"] > 0:
483
+ mcs_result[model] = round(model_stats[model]["self_draws"] / model_stats[model]["self_matches"], 2)
484
+ else:
485
+ mcs_result[model] = "N/A"
486
+ mcs_result = pd.Series(mcs_result)
 
 
 
 
 
 
 
 
 
 
 
487
 
488
  # Combine all results into a single DataFrame
489
  leaderboard_data = pd.DataFrame(
 
504
  leaderboard_data = leaderboard_data.round(
505
  {
506
  "Elo Score": 2,
 
507
  "Average Win Rate": 2,
508
  "Bradley-Terry Coefficient": 2,
509
  "Eigenvector Centrality Value": 2,