add Conversation Efficiency Index
app.py
CHANGED
@@ -296,12 +296,12 @@ def format_conversation_history(conversation_history):
     return formatted_text


-def save_content_to_hf(
+def save_content_to_hf(vote_data, repo_name, folder_name, file_name):
     """
     Save feedback content to Hugging Face repository organized by quarter.
     """
     # Serialize the content to JSON and encode it as bytes
-    json_content = json.dumps(
+    json_content = json.dumps(vote_data, indent=4).encode("utf-8")

     # Create a binary file-like object
     file_like_object = io.BytesIO(json_content)
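The upload call that consumes `file_like_object` falls outside this hunk. For orientation, a minimal sketch of what the rest of `save_content_to_hf` presumably does, assuming `huggingface_hub` and a dataset repo; the `path_in_repo` layout is an assumption based on the quarter-per-folder docstring:

```python
# Sketch only; the real upload code is not part of this diff.
# Assumes an HF token is available to huggingface_hub.
from huggingface_hub import HfApi

def upload_vote_sketch(file_like_object, repo_name, folder_name, file_name):
    HfApi().upload_file(
        path_or_fileobj=file_like_object,                # the io.BytesIO built above
        path_in_repo=f"{folder_name}/{file_name}.json",  # assumed repo layout
        repo_id=repo_name,                               # e.g. "SE-Arena/votes"
        repo_type="dataset",
    )
```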
@@ -334,7 +334,7 @@ def load_content_from_hf(repo_name="SE-Arena/votes"):
     Returns:
         list: Aggregated feedback data read from the repository.
     """
-
+    vote_data = []

     # Get the current year and quarter
     now = datetime.now()
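The body that derives the quarter and collects the repo files is elided above. A hedged sketch of the listing/download step that would produce the `file` and `local_path` names used in the next hunk; the `YYYY_QN` folder scheme and the use of `list_repo_files`/`hf_hub_download` are assumptions:

```python
# Hypothetical reconstruction, not the committed code.
from huggingface_hub import HfApi, hf_hub_download

folder = f"{now.year}_Q{(now.month - 1) // 3 + 1}"  # assumed quarter folder name
for file in HfApi().list_repo_files(repo_name, repo_type="dataset"):
    if file.startswith(folder) and file.endswith(".json"):
        local_path = hf_hub_download(
            repo_id=repo_name, filename=file, repo_type="dataset"
        )
```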
@@ -354,35 +354,31 @@ def load_content_from_hf(repo_name="SE-Arena/votes"):
                 )
                 with open(local_path, "r") as f:
                     data = json.load(f)
-
-
+                    data["timestamp"] = file.split("/")[-1].split(".")[0]
+                    vote_data.append(data)
+        return vote_data

     except:
         raise Exception("Error loading feedback data from Hugging Face repository.")


-def get_leaderboard_data(feedback_entry=None):
+def get_leaderboard_data(vote_entry=None):
     # Load feedback data from the Hugging Face repository
-
-
-
-    # Load conversation data from the Hugging Face repository
-    conversation_data = load_content_from_hf("SE-Arena/conversations")
-    conversation_df = pd.DataFrame(conversation_data)
+    vote_data = load_content_from_hf()
+    vote_df = pd.DataFrame(vote_data)

     # Concatenate the new feedback with the existing leaderboard data
-    if
-
-            [feedback_df, pd.DataFrame([feedback_entry])], ignore_index=True
-        )
+    if vote_entry is not None:
+        vote_df = pd.concat([vote_df, pd.DataFrame([vote_entry])], ignore_index=True)

-    if
+    if vote_df.empty:
         return pd.DataFrame(
             columns=[
                 "Rank",
                 "Model",
                 "Elo Score",
-                "
+                "Conversation Efficiency Index",
+                "Model Consistency Score",
                 "Average Win Rate",
                 "Bradley-Terry Coefficient",
                 "Eigenvector Centrality Value",
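The new `timestamp` field is simply the file's base name, which the save path sets to a `%Y%m%d_%H%M%S` string, so the same value later serves as a join key between votes and conversations. For example:

```python
file = "2025_Q1/20250216_142301.json"  # hypothetical repo path
stamp = file.split("/")[-1].split(".")[0]
print(stamp)  # -> "20250216_142301"
```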
@@ -392,7 +388,7 @@ def get_leaderboard_data(feedback_entry=None):
     )

     # map vote to winner
-
+    vote_df["winner"] = vote_df["winner"].map(
         {
             "left": evalica.Winner.X,
             "right": evalica.Winner.Y,
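`evalica` consumes pairwise results as three aligned sequences of left models, right models, and `Winner` values, which is why the stored `"left"`/`"right"`/`"tie"` strings are mapped first. A toy run with invented model names:

```python
import evalica

left = ["model-a", "model-b", "model-a"]
right = ["model-b", "model-a", "model-b"]
winners = [evalica.Winner.X, evalica.Winner.Draw, evalica.Winner.X]

result = evalica.elo(left, right, winners)
print(result.scores)  # pandas Series of Elo scores indexed by model name
```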
@@ -402,51 +398,76 @@ def get_leaderboard_data(feedback_entry=None):

     # Calculate scores using various metrics
     avr_result = evalica.average_win_rate(
-
+        vote_df["left"], vote_df["right"], vote_df["winner"]
     )
     bt_result = evalica.bradley_terry(
-
+        vote_df["left"], vote_df["right"], vote_df["winner"]
     )
-    newman_result = evalica.newman(
-        feedback_df["left"], feedback_df["right"], feedback_df["winner"]
-    )
-    eigen_result = evalica.eigen(
-        feedback_df["left"], feedback_df["right"], feedback_df["winner"]
-    )
-    elo_result = evalica.elo(
-        feedback_df["left"], feedback_df["right"], feedback_df["winner"]
-    )
+    newman_result = evalica.newman(vote_df["left"], vote_df["right"], vote_df["winner"])
+    eigen_result = evalica.eigen(vote_df["left"], vote_df["right"], vote_df["winner"])
+    elo_result = evalica.elo(vote_df["left"], vote_df["right"], vote_df["winner"])
     pagerank_result = evalica.pagerank(
-
+        vote_df["left"], vote_df["right"], vote_df["winner"]
     )

-    #
-
-        "
-    )
-
-    #
-    for model
-
-
-
-
-
-
-
-
-
-
-
-
+    # Load conversation data from the Hugging Face repository
+    conversation_data = load_content_from_hf("SE-Arena/conversations")
+    conversation_df = pd.DataFrame(conversation_data)
+
+    # Merge vote data with conversation data
+    all_df = pd.merge(
+        vote_df, conversation_df, on=["timestamp", "left", "right"], how="inner"
+    )
+
+    # Calculate the Conversation Efficiency Index (CEI) in a single pass
+    # Use dictionaries to accumulate per-model scores and best-case counts
+    model_rcs_sum = {}
+    model_rcs_max = {}
+
+    # Process each row once and accumulate scores
+    for _, row in all_df.iterrows():
+        # Determine scores based on winner
+        match row["winner"]:
+            case evalica.Winner.X:
+                left_score = 1.0
+                right_score = -1.0
+            case evalica.Winner.Y:
+                left_score = -1.0
+                right_score = 1.0
+            case _:  # Draw
+                left_score = 0.1
+                right_score = 0.1
+
+        # Count assistant rounds for each side
+        left_round = sum(1 for msg in row["left_chat"] if msg["role"] == "assistant")
+        right_round = sum(1 for msg in row["right_chat"] if msg["role"] == "assistant")
+
+        left_model = row["left"]
+        right_model = row["right"]
+
+        model_rcs_max[left_model] = model_rcs_max.get(left_model, 0) + 1.0 / left_round
+        model_rcs_max[right_model] = model_rcs_max.get(right_model, 0) + 1.0 / right_round
+
+        # Accumulate per-round scores
+        model_rcs_sum[left_model] = model_rcs_sum.get(left_model, 0) + left_score / left_round
+        model_rcs_sum[right_model] = model_rcs_sum.get(right_model, 0) + right_score / right_round
+
+    cei_result = {model: model_rcs_sum[model] / model_rcs_max[model] for model in model_rcs_sum}
+    cei_result = pd.Series({model: cei_result[model] for model in elo_result.scores.index})
+
+    # Model Consistency Score (MCS): draw rate across self-matches
+    self_matches = vote_df[vote_df["left"] == vote_df["right"]]
+    model_matches = self_matches.groupby("left")
+    draw_counts = model_matches["winner"].apply(lambda x: (x == evalica.Winner.Draw).sum())
+    total_counts = model_matches.size()
+    mcs_result = (draw_counts / total_counts).round(2).reindex(elo_result.scores.index, fill_value="N/A")

     # Combine all results into a single DataFrame
     leaderboard_data = pd.DataFrame(
         {
             "Model": elo_result.scores.index,
             "Elo Score": elo_result.scores.values,
-            "
+            "Conversation Efficiency Index": cei_result.values,
+            "Model Consistency Score": mcs_result.values,
             "Average Win Rate": avr_result.scores.values,
             "Bradley-Terry Coefficient": bt_result.scores.values,
             "Eigenvector Centrality Value": eigen_result.scores.values,
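In effect the CEI is a round-weighted outcome average: each merged match contributes `score / rounds` toward the numerator and a best-case `1 / rounds` toward the denominator, so quick wins count more than slow ones. A worked check of the loop with hypothetical numbers:

```python
# One model across two merged matches:
#   a win  (+1.0) reached after 2 assistant turns -> +1.0 / 2
#   a loss (-1.0) reached after 4 assistant turns -> -1.0 / 4
rcs_sum = 1.0 / 2 - 1.0 / 4   # 0.25  (actual round-weighted score)
rcs_max = 1.0 / 2 + 1.0 / 4   # 0.75  (ceiling: winning every match)
cei = rcs_sum / rcs_max       # 0.333..., displayed as 0.33 after rounding
```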
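The Model Consistency Score is the draw rate across self-matches, i.e. rounds where the same model was anonymously placed on both sides; a perfectly consistent model should tie with itself. A toy check of the groupby logic with invented data:

```python
import pandas as pd
import evalica

votes = pd.DataFrame({
    "left":   ["model-a", "model-a", "model-b"],
    "right":  ["model-a", "model-a", "model-b"],
    "winner": [evalica.Winner.Draw, evalica.Winner.X, evalica.Winner.Draw],
})
self_matches = votes[votes["left"] == votes["right"]]
groups = self_matches.groupby("left")
draws = groups["winner"].apply(lambda x: (x == evalica.Winner.Draw).sum())
mcs = (draws / groups.size()).round(2)
print(mcs)  # model-a -> 0.5, model-b -> 1.0
```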
@@ -459,6 +480,7 @@ def get_leaderboard_data(feedback_entry=None):
     leaderboard_data = leaderboard_data.round(
         {
             "Elo Score": 2,
+            "Conversation Efficiency Index": 2,
             "Average Win Rate": 2,
             "Bradley-Terry Coefficient": 2,
             "Eigenvector Centrality Value": 2,
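Note that `Model Consistency Score` is deliberately absent from this `round()` mapping: `mcs_result` was already rounded upstream and reindexed with the string `"N/A"` for models that have no self-matches, so it is left out of the numeric rounding here.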
@@ -509,12 +531,14 @@ with gr.Blocks() as app:
                 "Rank",
                 "Model",
                 "Elo Score",
-                "
+                "Conversation Efficiency Index",
+                "Model Consistency Score",
             ],
             search_columns=["Model"],
             filter_columns=[
                 "Elo Score",
-                "
+                "Conversation Efficiency Index",
+                "Model Consistency Score",
                 "Average Win Rate",
                 "Bradley-Terry Coefficient",
                 "Eigenvector Centrality Value",
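These column lists presumably configure a `gradio_leaderboard.Leaderboard` component; the constructor itself sits outside the hunk, so the sketch below infers it from the visible keyword arguments:

```python
# Hedged sketch; only the column lists are visible in this diff.
from gradio_leaderboard import Leaderboard

leaderboard = Leaderboard(
    value=get_leaderboard_data(),
    select_columns=["Rank", "Model", "Elo Score",
                    "Conversation Efficiency Index", "Model Consistency Score"],
    search_columns=["Model"],
    filter_columns=["Elo Score", "Conversation Efficiency Index",
                    "Model Consistency Score", "Average Win Rate",
                    "Bradley-Terry Coefficient", "Eigenvector Centrality Value"],
)
```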
@@ -1117,7 +1141,7 @@ with gr.Blocks() as app:
                 winner_model = "tie"

             # Create feedback entry
-
+            vote_entry = {
                 "left": models_state["left"],
                 "right": models_state["right"],
                 "winner": winner_model,
@@ -1130,7 +1154,7 @@ with gr.Blocks() as app:
             file_name = now.strftime("%Y%m%d_%H%M%S")

             # Save feedback back to the Hugging Face dataset
-            save_content_to_hf(
+            save_content_to_hf(vote_entry, "SE-Arena/votes", folder_name, file_name)

             conversation_state["right_chat"][0]["content"] = conversation_state[
                 "right_chat"
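For the inner merge on `["timestamp", "left", "right"]` in `get_leaderboard_data` to pair each vote with its conversation, the conversation record is presumably saved under the same `folder_name` and `file_name` in the companion repo; a hedged sketch of that counterpart call (entry shape assumed):

```python
# Assumed counterpart, not shown in this hunk: identical folder/file names
# make both repos yield the same "timestamp" key on reload.
conversation_entry = {
    "left": models_state["left"],
    "right": models_state["right"],
    "left_chat": conversation_state["left_chat"],
    "right_chat": conversation_state["right_chat"],
}
save_content_to_hf(conversation_entry, "SE-Arena/conversations", folder_name, file_name)
```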
@@ -1175,7 +1199,7 @@ with gr.Blocks() as app:
                 gr.update(
                     value="Can't Decide", interactive=True
                 ),  # [10] Reset feedback radio selection
-                get_leaderboard_data(
+                get_leaderboard_data(vote_entry),  # [11] Updated leaderboard data
                 gr.update(
                     visible=True
                 ),  # [12] Show the thanks_message markdown component