zhiminy committed
Commit deaae24 · Parent: 17322b6

add Conversation Efficiency Index

Files changed (1): app.py (+81 -57)
app.py CHANGED
@@ -296,12 +296,12 @@ def format_conversation_history(conversation_history):
     return formatted_text
 
 
-def save_content_to_hf(feedback_data, repo_name, folder_name, file_name):
+def save_content_to_hf(vote_data, repo_name, folder_name, file_name):
     """
     Save feedback content to Hugging Face repository organized by quarter.
     """
     # Serialize the content to JSON and encode it as bytes
-    json_content = json.dumps(feedback_data, indent=4).encode("utf-8")
+    json_content = json.dumps(vote_data, indent=4).encode("utf-8")
 
     # Create a binary file-like object
     file_like_object = io.BytesIO(json_content)
@@ -334,7 +334,7 @@ def load_content_from_hf(repo_name="SE-Arena/votes"):
     Returns:
         list: Aggregated feedback data read from the repository.
     """
-    feedback_data = []
+    vote_data = []
 
     # Get the current year and quarter
     now = datetime.now()
@@ -354,35 +354,31 @@ def load_content_from_hf(repo_name="SE-Arena/votes"):
             )
             with open(local_path, "r") as f:
                 data = json.load(f)
-                feedback_data.append(data)
-        return feedback_data
+                data["timestamp"] = file.split("/")[-1].split(".")[0]
+                vote_data.append(data)
+        return vote_data
 
     except:
         raise Exception("Error loading feedback data from Hugging Face repository.")
 
 
-def get_leaderboard_data(feedback_entry=None):
+def get_leaderboard_data(vote_entry=None):
     # Load feedback data from the Hugging Face repository
-    feedback_data = load_content_from_hf()
-    feedback_df = pd.DataFrame(feedback_data)
-
-    # Load conversation data from the Hugging Face repository
-    conversation_data = load_content_from_hf("SE-Arena/conversations")
-    conversation_df = pd.DataFrame(conversation_data)
+    vote_data = load_content_from_hf()
+    vote_df = pd.DataFrame(vote_data)
 
     # Concatenate the new feedback with the existing leaderboard data
-    if feedback_entry is not None:
-        feedback_df = pd.concat(
-            [feedback_df, pd.DataFrame([feedback_entry])], ignore_index=True
-        )
+    if vote_entry is not None:
+        vote_df = pd.concat([vote_df, pd.DataFrame([vote_entry])], ignore_index=True)
 
-    if feedback_df.empty:
+    if vote_df.empty:
         return pd.DataFrame(
             columns=[
                 "Rank",
                 "Model",
                 "Elo Score",
-                "Consistency Score",
+                "Conversation Efficiency Index",
+                "Model Consistency Score",
                 "Average Win Rate",
                 "Bradley-Terry Coefficient",
                 "Eigenvector Centrality Value",
@@ -392,7 +388,7 @@ def get_leaderboard_data(feedback_entry=None):
         )
 
     # map vote to winner
-    feedback_df["winner"] = feedback_df["winner"].map(
+    vote_df["winner"] = vote_df["winner"].map(
         {
             "left": evalica.Winner.X,
             "right": evalica.Winner.Y,
@@ -402,51 +398,76 @@ def get_leaderboard_data(feedback_entry=None):
 
     # Calculate scores using various metrics
     avr_result = evalica.average_win_rate(
-        feedback_df["left"], feedback_df["right"], feedback_df["winner"]
+        vote_df["left"], vote_df["right"], vote_df["winner"]
     )
     bt_result = evalica.bradley_terry(
-        feedback_df["left"], feedback_df["right"], feedback_df["winner"]
-    )
-    newman_result = evalica.newman(
-        feedback_df["left"], feedback_df["right"], feedback_df["winner"]
-    )
-    eigen_result = evalica.eigen(
-        feedback_df["left"], feedback_df["right"], feedback_df["winner"]
-    )
-    elo_result = evalica.elo(
-        feedback_df["left"], feedback_df["right"], feedback_df["winner"]
+        vote_df["left"], vote_df["right"], vote_df["winner"]
     )
+    newman_result = evalica.newman(vote_df["left"], vote_df["right"], vote_df["winner"])
+    eigen_result = evalica.eigen(vote_df["left"], vote_df["right"], vote_df["winner"])
+    elo_result = evalica.elo(vote_df["left"], vote_df["right"], vote_df["winner"])
     pagerank_result = evalica.pagerank(
-        feedback_df["left"], feedback_df["right"], feedback_df["winner"]
+        vote_df["left"], vote_df["right"], vote_df["winner"]
     )
+
+    # Load conversation data from the Hugging Face repository
+    conversation_data = load_content_from_hf("SE-Arena/conversations")
+    conversation_df = pd.DataFrame(conversation_data)
 
-    # Calculate consistency score as a pandas Series aligned with other metrics
-    cs_result = pd.Series(
-        "N/A", index=elo_result.scores.index
-    )  # Initialize with zeros using same index
-
-    # Loop through models and update values
-    for model in cs_result.index:
-        # Filter self-matches for this model
-        self_matches = feedback_df[
-            (feedback_df["left"] == model) & (feedback_df["right"] == model)
-        ]
-        totals = len(self_matches)
-
-        if totals:
-            # Count non-draw outcomes (wins or losses)
-            cs_result[model] = round(
-                self_matches[self_matches["winner"] == evalica.Winner.Draw].shape[0]
-                / totals,
-                2,
-            )
+    # Merge vote data with conversation data
+    all_df = pd.merge(
+        vote_df, conversation_df, on=["timestamp", "left", "right"], how="inner"
+    )
+
+    # Calculate the Conversation Efficiency Index for each model
+    # Create a dictionary to store accumulated scores and counts for each model
+    model_rcs_sum = {}
+    model_rcs_max = {}
+
+    # Process each row once and accumulate scores
+    for _, row in all_df.iterrows():
+        # Determine scores based on winner
+        match row["winner"]:
+            case evalica.Winner.X:
+                left_score = 1.0
+                right_score = -1.0
+            case evalica.Winner.Y:
+                left_score = -1.0
+                right_score = 1.0
+            case _:  # Draw
+                left_score = 0.1
+                right_score = 0.1
+
+        # Count rounds for each side
+        left_round = sum(1 for msg in row["left_chat"] if msg["role"] == "assistant")
+        right_round = sum(1 for msg in row["right_chat"] if msg["role"] == "assistant")
+
+        left_model = row["left"]
+        right_model = row["right"]
+
+        model_rcs_max[left_model] = model_rcs_max.get(left_model, 0) + 1.0 / left_round
+        model_rcs_max[right_model] = model_rcs_max.get(right_model, 0) + 1.0 / right_round
+
+        # Calculate per-round scores
+        model_rcs_sum[left_model] = model_rcs_sum.get(left_model, 0) + left_score / left_round
+        model_rcs_sum[right_model] = model_rcs_sum.get(right_model, 0) + right_score / right_round
+
+    cei_result = {model: model_rcs_sum[model] / model_rcs_max[model] for model in model_rcs_sum}
+    cei_result = pd.Series({model: cei_result[model] for model in elo_result.scores.index})
+
+    self_matches = vote_df[vote_df["left"] == vote_df["right"]]
+    model_matches = self_matches.groupby("left")
+    draw_counts = model_matches["winner"].apply(lambda x: (x == evalica.Winner.Draw).sum())
+    total_counts = model_matches.size()
+    mcs_result = (draw_counts / total_counts).round(2).reindex(elo_result.scores.index, fill_value="N/A")
 
     # Combine all results into a single DataFrame
     leaderboard_data = pd.DataFrame(
         {
             "Model": elo_result.scores.index,
             "Elo Score": elo_result.scores.values,
-            "Consistency Score": cs_result.values,
+            "Conversation Efficiency Index": cei_result.values,
+            "Model Consistency Score": mcs_result.values,
             "Average Win Rate": avr_result.scores.values,
             "Bradley-Terry Coefficient": bt_result.scores.values,
             "Eigenvector Centrality Value": eigen_result.scores.values,
@@ -459,6 +480,7 @@ def get_leaderboard_data(feedback_entry=None):
     leaderboard_data = leaderboard_data.round(
         {
             "Elo Score": 2,
+            "Conversation Efficiency Index": 2,
            "Average Win Rate": 2,
            "Bradley-Terry Coefficient": 2,
            "Eigenvector Centrality Value": 2,
@@ -509,12 +531,14 @@ with gr.Blocks() as app:
                "Rank",
                "Model",
                "Elo Score",
-                "Consistency Score",
+                "Conversation Efficiency Index",
+                "Model Consistency Score",
            ],
            search_columns=["Model"],
            filter_columns=[
                "Elo Score",
-                "Consistency Score",
+                "Conversation Efficiency Index",
+                "Model Consistency Score",
                "Average Win Rate",
                "Bradley-Terry Coefficient",
                "Eigenvector Centrality Value",
@@ -1117,7 +1141,7 @@ with gr.Blocks() as app:
                winner_model = "tie"
 
            # Create feedback entry
-            feedback_entry = {
+            vote_entry = {
                "left": models_state["left"],
                "right": models_state["right"],
                "winner": winner_model,
@@ -1130,7 +1154,7 @@ with gr.Blocks() as app:
            file_name = now.strftime("%Y%m%d_%H%M%S")
 
            # Save feedback back to the Hugging Face dataset
-            save_content_to_hf(feedback_entry, "SE-Arena/votes", folder_name, file_name)
+            save_content_to_hf(vote_entry, "SE-Arena/votes", folder_name, file_name)
 
            conversation_state["right_chat"][0]["content"] = conversation_state[
                "right_chat"
@@ -1175,7 +1199,7 @@ with gr.Blocks() as app:
            gr.update(
                value="Can't Decide", interactive=True
            ),  # [10] Reset feedback radio selection
-            get_leaderboard_data(feedback_entry),  # [11] Updated leaderboard data
+            get_leaderboard_data(vote_entry),  # [11] Updated leaderboard data
            gr.update(
                visible=True
            ),  # [12] Show the thanks_message markdown component
 
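The Conversation Efficiency Index (CEI) itself rewards verdicts reached in fewer assistant turns: each vote adds score / rounds to a model's running total (win = 1.0, loss = -1.0, draw = 0.1) and 1 / rounds to the best total that model could have reached, so CEI, the ratio of the two, lies in [-1, 1] and equals 1.0 only if every vote was a win. A minimal self-contained sketch of the same accumulation, with hypothetical model names and votes and a Winner stand-in for evalica.Winner:

from enum import Enum


class Winner(Enum):  # stand-in for evalica.Winner
    X = "x"  # left model wins
    Y = "y"  # right model wins
    Draw = "draw"


# Hypothetical votes: (left model, right model, verdict, left rounds, right rounds)
votes = [
    ("model-a", "model-b", Winner.X, 1, 1),
    ("model-a", "model-b", Winner.Y, 3, 2),
    ("model-a", "model-b", Winner.Draw, 2, 2),
]

rcs_sum, rcs_max = {}, {}
for left, right, winner, left_round, right_round in votes:
    if winner is Winner.X:
        left_score, right_score = 1.0, -1.0
    elif winner is Winner.Y:
        left_score, right_score = -1.0, 1.0
    else:  # a draw gives both sides a small positive credit
        left_score = right_score = 0.1

    # A vote's weight decays with the number of assistant turns that side used.
    rcs_max[left] = rcs_max.get(left, 0) + 1.0 / left_round
    rcs_max[right] = rcs_max.get(right, 0) + 1.0 / right_round
    rcs_sum[left] = rcs_sum.get(left, 0) + left_score / left_round
    rcs_sum[right] = rcs_sum.get(right, 0) + right_score / right_round

# Normalize each accumulated score by the best score that model could have reached.
cei = {model: rcs_sum[model] / rcs_max[model] for model in rcs_sum}
print(cei)  # {'model-a': 0.3909..., 'model-b': -0.225}

The small positive draw credit keeps a tie better than a loss, and the division by rounds makes a one-turn win worth more than a five-turn win.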
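Finally, the old per-model loop for the consistency score becomes a vectorized groupby over self-matches (votes where the same model was anonymously placed on both sides), surfaced as the Model Consistency Score (MCS): the fraction of a model's self-matches judged a draw, with "N/A" filled in for models that never met themselves. A small sketch of that groupby pattern, using plain strings where app.py uses evalica.Winner values:

import pandas as pd

# Hypothetical votes; the last row is not a self-match and is filtered out.
vote_df = pd.DataFrame(
    {
        "left": ["model-a", "model-a", "model-b", "model-c"],
        "right": ["model-a", "model-a", "model-b", "model-d"],
        "winner": ["draw", "left", "draw", "left"],
    }
)

self_matches = vote_df[vote_df["left"] == vote_df["right"]]
grouped = self_matches.groupby("left")["winner"]
draw_counts = grouped.apply(lambda x: (x == "draw").sum())
mcs = (draw_counts / grouped.size()).round(2)
print(mcs)  # model-a: 0.5, model-b: 1.0 (model-c would get "N/A" via reindex)

Reindexing with fill_value="N/A" leaves the column object-typed, which is presumably why "Model Consistency Score" is absent from the later round(...) call while "Conversation Efficiency Index" is added to it.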