fix the sync bug
Browse files
- app.py +92 -107
- context_window.json +2 -3
app.py
CHANGED
@@ -28,6 +28,9 @@ openai_client = OpenAI(api_key=api_key, base_url=base_url)
 # Timeout in seconds for model responses
 TIMEOUT = 90
 
+# leaderboard data
+leaderboard_data = None
+
 # Hint string constant
 SHOW_HINT_STRING = True  # Set to False to hide the hint string altogether
 HINT_STRING = "Once signed in, your votes will be recorded securely."
@@ -282,7 +285,7 @@ def chat_with_models(
     return formatted_response
 
 
-def save_content_to_hf(content, repo_name):
+def save_content_to_hf(feedback_data, repo_name):
     """
     Save feedback content to Hugging Face repository organized by month and year.
 
@@ -291,13 +294,8 @@ def save_content_to_hf(content, repo_name):
         month_year (str): Year and month string in the format "YYYY_MM".
         repo_name (str): Hugging Face repository name.
     """
-    # Ensure the user is authenticated with HF
-    token = HfFolder.get_token()
-    if token is None:
-        raise ValueError("Please log in to Hugging Face using `huggingface-cli login`.")
-
     # Serialize the content to JSON and encode it as bytes
-    json_content = json.dumps(content, indent=4).encode("utf-8")
+    json_content = json.dumps(feedback_data, indent=4).encode("utf-8")
 
     # Create a binary file-like object
     file_like_object = io.BytesIO(json_content)
@@ -309,6 +307,11 @@ def save_content_to_hf(content, repo_name):
     # Define the path in the repository
     filename = f"{month_year}/{day_hour_minute_second}.json"
 
+    # Ensure the user is authenticated with HF
+    token = HfFolder.get_token()
+    if token is None:
+        raise ValueError("Please log in to Hugging Face using `huggingface-cli login`.")
+
     # Upload to Hugging Face repository
     upload_file(
         path_or_fileobj=file_like_object,
@@ -340,15 +343,15 @@ def load_content_from_hf(repo_name="SE-Arena/votes"):
     repo_files = api.list_repo_files(repo_id=repo_name, repo_type="dataset")
 
     # Filter files by current year and month
-    feedback_files = [file for file in repo_files if year_month in file]
+    leaderboard_files = [file for file in repo_files if year_month in file]
 
-    if not feedback_files:
+    if not leaderboard_files:
         raise FileNotFoundError(
             f"No feedback files found for {year_month} in {repo_name}."
         )
 
     # Download and aggregate data
-    for file in feedback_files:
+    for file in leaderboard_files:
         local_path = hf_hub_download(
             repo_id=repo_name, filename=file, repo_type="dataset"
         )
@@ -366,100 +369,85 @@ def load_content_from_hf(repo_name="SE-Arena/votes"):
 
 
 def get_leaderboard_data():
-    # Load feedback data from the Hugging Face repository
-    try:
-        feedback_data = load_content_from_hf()
-        feedback_df = pd.DataFrame(feedback_data)
-    except:
-        # If no feedback exists, return an empty DataFrame
-        return pd.DataFrame(
-            columns=[
-                "Rank",
-                "Model",
-                "Elo Score",
-                "Average Win Rate",
-                "Bradley-Terry Coefficient",
-                "Eigenvector Centrality Value",
-                "Newman Modularity Score",
-                "PageRank Score",
-            ]
-        )
-
-    feedback_df["winner"] = feedback_df["winner"].map(
-        {
-            "left": evalica.Winner.X,
-            "right": evalica.Winner.Y,
-            "tie": evalica.Winner.Draw,
-        }
-    )
-
-    # Calculate scores using various metrics
-    avr_result = evalica.average_win_rate(
-        feedback_df["left"], feedback_df["right"], feedback_df["winner"]
-    )
-    bt_result = evalica.bradley_terry(
-        feedback_df["left"], feedback_df["right"], feedback_df["winner"]
-    )
-    newman_result = evalica.newman(
-        feedback_df["left"], feedback_df["right"], feedback_df["winner"]
-    )
-    eigen_result = evalica.eigen(
-        feedback_df["left"], feedback_df["right"], feedback_df["winner"]
-    )
-    elo_result = evalica.elo(
-        feedback_df["left"], feedback_df["right"], feedback_df["winner"]
-    )
-    pagerank_result = evalica.pagerank(
-        feedback_df["left"], feedback_df["right"], feedback_df["winner"]
-    )
-
-    # Combine all results into a single DataFrame
-    ranking_df = pd.DataFrame(
-        {
-            "Model": elo_result.scores.index,
-            "Elo Score": elo_result.scores.values,
-            "Average Win Rate": avr_result.scores.values * 100,
-            "Bradley-Terry Coefficient": bt_result.scores.values,
-            "Eigenvector Centrality Value": eigen_result.scores.values,
-            "PageRank Score": pagerank_result.scores.values,
-            "Newman Modularity Score": newman_result.scores.values,
-        }
-    )
-
-    # Round all numeric columns to two decimal places
-    ranking_df = ranking_df.round(
-        {
-            "Elo Score": 2,
-            "Average Win Rate": 2,
-            "Bradley-Terry Coefficient": 2,
-            "Eigenvector Centrality Value": 2,
-            "Newman Modularity Score": 2,
-            "PageRank Score": 2,
-        }
-    )
-
-    # Add a Rank column based on Elo scores
-    ranking_df["Rank"] = ranking_df["Elo Score"].rank(ascending=False).astype(int)
-
-    # Reorder columns so that Rank comes first
-    ranking_df = ranking_df[
-        [
-            "Rank",
-            "Model",
-            "Elo Score",
-            "Average Win Rate",
-            "Bradley-Terry Coefficient",
-            "Eigenvector Centrality Value",
-            "Newman Modularity Score",
-            "PageRank Score",
-        ]
-    ]
-
-    return ranking_df
+    if leaderboard_data is None:
+        # Load feedback data from the Hugging Face repository
+        try:
+            feedback_data = load_content_from_hf()
+            feedback_df = pd.DataFrame(feedback_data)
+
+            # map vote to winner
+            feedback_df["winner"] = feedback_df["winner"].map(
+                {
+                    "left": evalica.Winner.X,
+                    "right": evalica.Winner.Y,
+                    "tie": evalica.Winner.Draw,
+                }
+            )
+
+            # Calculate scores using various metrics
+            avr_result = evalica.average_win_rate(
+                feedback_df["left"], feedback_df["right"], feedback_df["winner"]
+            )
+            bt_result = evalica.bradley_terry(
+                feedback_df["left"], feedback_df["right"], feedback_df["winner"]
+            )
+            newman_result = evalica.newman(
+                feedback_df["left"], feedback_df["right"], feedback_df["winner"]
+            )
+            eigen_result = evalica.eigen(
+                feedback_df["left"], feedback_df["right"], feedback_df["winner"]
+            )
+            elo_result = evalica.elo(
+                feedback_df["left"], feedback_df["right"], feedback_df["winner"]
+            )
+            pagerank_result = evalica.pagerank(
+                feedback_df["left"], feedback_df["right"], feedback_df["winner"]
+            )
+
+            # Combine all results into a single DataFrame
+            leaderboard_data = pd.DataFrame(
+                {
+                    "Model": elo_result.scores.index,
+                    "Elo Score": elo_result.scores.values,
+                    "Average Win Rate": avr_result.scores.values * 100,
+                    "Bradley-Terry Coefficient": bt_result.scores.values,
+                    "Eigenvector Centrality Value": eigen_result.scores.values,
+                    "Newman Modularity Score": newman_result.scores.values,
+                    "PageRank Score": pagerank_result.scores.values,
+                }
+            )
+
+            # Round all numeric columns to two decimal places
+            leaderboard_data = leaderboard_data.round(
+                {
+                    "Elo Score": 2,
+                    "Average Win Rate": 2,
+                    "Bradley-Terry Coefficient": 2,
+                    "Eigenvector Centrality Value": 2,
+                    "Newman Modularity Score": 2,
+                    "PageRank Score": 2,
+                }
+            )
+
+            # Add a Rank column based on Elo scores
+            leaderboard_data["Rank"] = (
+                leaderboard_data["Elo Score"].rank(ascending=False).astype(int)
+            )
+        except:
+            # If no feedback exists, return an empty DataFrame
+            return pd.DataFrame(
+                columns=[
+                    "Rank",
+                    "Model",
+                    "Elo Score",
+                    "Average Win Rate",
+                    "Bradley-Terry Coefficient",
+                    "Eigenvector Centrality Value",
+                    "Newman Modularity Score",
+                    "PageRank Score",
+                ]
+            )
+    return leaderboard_data
 
 
 # Function to enable or disable submit buttons based on textbox content
@@ -916,9 +904,6 @@ with gr.Blocks() as app:
         )
 
     def submit_feedback(vote, models_state, conversation_state):
-        # Get current timestamp
-        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-
        # Map vote to actual model names
        match vote:
            case "Model A":
@@ -933,9 +918,12 @@ with gr.Blocks() as app:
            "left": models_state["Model A"],
            "right": models_state["Model B"],
            "winner": winner_model,
-            "timestamp": timestamp,
+            "timestamp": datetime.now().strftime("%Y%m%d_%H%M%S"),
        }
 
+        # Concatenate the new feedback with the existing leaderboard data
+        leaderboard_data = pd.concat([get_leaderboard_data(), pd.DataFrame([feedback_entry])], ignore_index=True)
+
        # Save feedback back to the Hugging Face dataset
        save_content_to_hf(feedback_entry, "SE-Arena/votes")
 
@@ -946,9 +934,6 @@ with gr.Blocks() as app:
        models_state.clear()
        conversation_state.clear()
 
-        # Recalculate leaderboard
-        leaderboard_data = get_leaderboard_data()
-
        # Adjust output count to match the interface definition
        return (
            gr.update(
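Note on the caching pattern above: the commit introduces a module-level leaderboard_data cache that get_leaderboard_data() rebuilds only when it is None, and that submit_feedback() updates in place. In Python, a function that assigns to a module-level name must declare it with `global`, otherwise the assignment binds a function-local name and the module-level cache is never updated. Below is a minimal standalone sketch of the intended pattern, not the Space's actual code; compute_leaderboard() is a hypothetical stand-in for the evalica-based ranking.

# Minimal sketch of a module-level cache, assuming the compute step is expensive.
leaderboard_data = None  # module-level cache, empty until first request


def compute_leaderboard():
    # Hypothetical placeholder for the evalica-based ranking computation.
    return [{"Model": "model-a", "Elo Score": 1000.0}]


def get_leaderboard_data():
    global leaderboard_data  # required: this function rebinds the module-level name
    if leaderboard_data is None:
        leaderboard_data = compute_leaderboard()
    return leaderboard_data

The same consideration applies to submit_feedback in the diff, which also assigns to leaderboard_data inside a function body.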
context_window.json
CHANGED
@@ -14,9 +14,8 @@
     "llama-3.1-405b": 128000,
     "llama-3.1-70b": 128000,
     "llama-3.3-70b": 128000,
-    "o1": …,
-    "o1-mini": …,
-    "Qwen2-72B-Instruct": 131072,
+    "o1": 128000,
+    "o1-mini": 128000,
     "Qwen2.5-32B-Instruct": 131072,
     "qwen2.5-72b": 32768,
     "Qwen2.5-72B-Instruct": 131072,
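context_window.json maps model identifiers to context-window sizes in tokens. A hedged sketch of how such a lookup table is typically consumed; the file path and the fallback default are assumptions for illustration, not taken from app.py.

import json

# Load the model-name -> context-window-size table (path assumed relative to the app).
with open("context_window.json") as f:
    CONTEXT_WINDOWS = json.load(f)


def get_context_window(model_name: str, default: int = 8192) -> int:
    # `default` is an assumed conservative fallback for models not in the table.
    return CONTEXT_WINDOWS.get(model_name, default)


print(get_context_window("o1"))        # 128000 after this commit
print(get_context_window("unknown"))   # 8192 (assumed fallback)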