zhiminy committed on
Commit
b52baf5
·
1 Parent(s): 49865eb

fix the sync bug

Browse files
Files changed (2) hide show
  1. app.py +92 -107
  2. context_window.json +2 -3
app.py CHANGED
@@ -28,6 +28,9 @@ openai_client = OpenAI(api_key=api_key, base_url=base_url)
28
  # Timeout in seconds for model responses
29
  TIMEOUT = 90
30
 
 
 
 
31
  # Hint string constant
32
  SHOW_HINT_STRING = True # Set to False to hide the hint string altogether
33
  HINT_STRING = "Once signed in, your votes will be recorded securely."
@@ -282,7 +285,7 @@ def chat_with_models(
282
  return formatted_response
283
 
284
 
285
- def save_content_to_hf(content, repo_name):
286
  """
287
  Save feedback content to Hugging Face repository organized by month and year.
288
 
@@ -291,13 +294,8 @@ def save_content_to_hf(content, repo_name):
291
  month_year (str): Year and month string in the format "YYYY_MM".
292
  repo_name (str): Hugging Face repository name.
293
  """
294
- # Ensure the user is authenticated with HF
295
- token = HfFolder.get_token()
296
- if token is None:
297
- raise ValueError("Please log in to Hugging Face using `huggingface-cli login`.")
298
-
299
  # Serialize the content to JSON and encode it as bytes
300
- json_content = json.dumps(content, indent=4).encode("utf-8")
301
 
302
  # Create a binary file-like object
303
  file_like_object = io.BytesIO(json_content)
@@ -309,6 +307,11 @@ def save_content_to_hf(content, repo_name):
309
  # Define the path in the repository
310
  filename = f"{month_year}/{day_hour_minute_second}.json"
311
 
 
 
 
 
 
312
  # Upload to Hugging Face repository
313
  upload_file(
314
  path_or_fileobj=file_like_object,
@@ -340,15 +343,15 @@ def load_content_from_hf(repo_name="SE-Arena/votes"):
340
  repo_files = api.list_repo_files(repo_id=repo_name, repo_type="dataset")
341
 
342
  # Filter files by current year and month
343
- feedback_files = [file for file in repo_files if year_month in file]
344
 
345
- if not feedback_files:
346
  raise FileNotFoundError(
347
  f"No feedback files found for {year_month} in {repo_name}."
348
  )
349
 
350
  # Download and aggregate data
351
- for file in feedback_files:
352
  local_path = hf_hub_download(
353
  repo_id=repo_name, filename=file, repo_type="dataset"
354
  )
@@ -366,100 +369,85 @@ def load_content_from_hf(repo_name="SE-Arena/votes"):
366
 
367
 
368
  def get_leaderboard_data():
369
- # Load feedback data from the Hugging Face repository
370
- try:
371
- feedback_data = load_content_from_hf()
372
- feedback_df = pd.DataFrame(feedback_data)
373
- except:
374
- # If no feedback exists, return an empty DataFrame
375
- return pd.DataFrame(
376
- columns=[
377
- "Rank",
378
- "Model",
379
- "Elo Score",
380
- "Average Win Rate",
381
- "Bradley-Terry Coefficient",
382
- "Eigenvector Centrality Value",
383
- "Newman Modularity Score",
384
- "PageRank Score",
385
- ]
386
- )
387
-
388
- feedback_df["winner"] = feedback_df["winner"].map(
389
- {
390
- "left": evalica.Winner.X,
391
- "right": evalica.Winner.Y,
392
- "tie": evalica.Winner.Draw,
393
- }
394
- )
395
-
396
- # Calculate scores using various metrics
397
- avr_result = evalica.average_win_rate(
398
- feedback_df["left"], feedback_df["right"], feedback_df["winner"]
399
- )
400
- bt_result = evalica.bradley_terry(
401
- feedback_df["left"], feedback_df["right"], feedback_df["winner"]
402
- )
403
- newman_result = evalica.newman(
404
- feedback_df["left"], feedback_df["right"], feedback_df["winner"]
405
- )
406
- eigen_result = evalica.eigen(
407
- feedback_df["left"], feedback_df["right"], feedback_df["winner"]
408
- )
409
- elo_result = evalica.elo(
410
- feedback_df["left"], feedback_df["right"], feedback_df["winner"]
411
- )
412
- pagerank_result = evalica.pagerank(
413
- feedback_df["left"], feedback_df["right"], feedback_df["winner"]
414
- )
415
-
416
- # Combine all results into a single DataFrame
417
- ranking_df = pd.DataFrame(
418
- {
419
- "Model": elo_result.scores.index,
420
- "Elo Score": elo_result.scores.values,
421
- "Average Win Rate": avr_result.scores.values * 100,
422
- "Bradley-Terry Coefficient": bt_result.scores.values,
423
- "Eigenvector Centrality Value": eigen_result.scores.values,
424
- "PageRank Score": pagerank_result.scores.values,
425
- "Newman Modularity Score": newman_result.scores.values,
426
- }
427
- )
428
 
429
- # Add a Rank column based on Elo scores
430
- ranking_df["Rank"] = (
431
- ranking_df["Elo Score"].rank(ascending=False, method="min").astype(int)
432
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
433
 
434
- # Round all numeric columns to two decimal places
435
- ranking_df = ranking_df.round(
436
- {
437
- "Elo Score": 2,
438
- "Average Win Rate": 2,
439
- "Bradley-Terry Coefficient": 2,
440
- "Eigenvector Centrality Value": 2,
441
- "PageRank Score": 2,
442
- "Newman Modularity Score": 2,
443
- }
444
- )
 
445
 
446
- # Reorder columns to make 'Rank' the first column
447
- ranking_df = ranking_df.sort_values(by="Rank").reset_index(drop=True)
448
-
449
- ranking_df = ranking_df[
450
- [
451
- "Rank",
452
- "Model",
453
- "Elo Score",
454
- "Average Win Rate",
455
- "Bradley-Terry Coefficient",
456
- "Eigenvector Centrality Value",
457
- "Newman Modularity Score",
458
- "PageRank Score",
459
- ]
460
- ]
461
 
462
- return ranking_df
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
463
 
464
 
465
  # Function to enable or disable submit buttons based on textbox content
@@ -916,9 +904,6 @@ with gr.Blocks() as app:
916
  )
917
 
918
  def submit_feedback(vote, models_state, conversation_state):
919
- # Get current timestamp
920
- timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
921
-
922
  # Map vote to actual model names
923
  match vote:
924
  case "Model A":
@@ -933,9 +918,12 @@ with gr.Blocks() as app:
933
  "left": models_state["Model A"],
934
  "right": models_state["Model B"],
935
  "winner": winner_model,
936
- "timestamp": timestamp,
937
  }
938
 
 
 
 
939
  # Save feedback back to the Hugging Face dataset
940
  save_content_to_hf(feedback_entry, "SE-Arena/votes")
941
 
@@ -946,9 +934,6 @@ with gr.Blocks() as app:
946
  models_state.clear()
947
  conversation_state.clear()
948
 
949
- # Recalculate leaderboard
950
- leaderboard_data = get_leaderboard_data()
951
-
952
  # Adjust output count to match the interface definition
953
  return (
954
  gr.update(
 
28
  # Timeout in seconds for model responses
29
  TIMEOUT = 90
30
 
31
+ # leaderboard data
32
+ leaderboard_data = None
33
+
34
  # Hint string constant
35
  SHOW_HINT_STRING = True # Set to False to hide the hint string altogether
36
  HINT_STRING = "Once signed in, your votes will be recorded securely."
 
285
  return formatted_response
286
 
287
 
288
+ def save_content_to_hf(feedback_data, repo_name):
289
  """
290
  Save feedback content to Hugging Face repository organized by month and year.
291
 
 
294
  month_year (str): Year and month string in the format "YYYY_MM".
295
  repo_name (str): Hugging Face repository name.
296
  """
 
 
 
 
 
297
  # Serialize the content to JSON and encode it as bytes
298
+ json_content = json.dumps(feedback_data, indent=4).encode("utf-8")
299
 
300
  # Create a binary file-like object
301
  file_like_object = io.BytesIO(json_content)
 
307
  # Define the path in the repository
308
  filename = f"{month_year}/{day_hour_minute_second}.json"
309
 
310
+ # Ensure the user is authenticated with HF
311
+ token = HfFolder.get_token()
312
+ if token is None:
313
+ raise ValueError("Please log in to Hugging Face using `huggingface-cli login`.")
314
+
315
  # Upload to Hugging Face repository
316
  upload_file(
317
  path_or_fileobj=file_like_object,
 
343
  repo_files = api.list_repo_files(repo_id=repo_name, repo_type="dataset")
344
 
345
  # Filter files by current year and month
346
+ leaderboard_files = [file for file in repo_files if year_month in file]
347
 
348
+ if not leaderboard_files:
349
  raise FileNotFoundError(
350
  f"No feedback files found for {year_month} in {repo_name}."
351
  )
352
 
353
  # Download and aggregate data
354
+ for file in leaderboard_files:
355
  local_path = hf_hub_download(
356
  repo_id=repo_name, filename=file, repo_type="dataset"
357
  )
 
369
 
370
 
371
  def get_leaderboard_data():
372
+ if leaderboard_data is None:
373
+ # Load feedback data from the Hugging Face repository
374
+ try:
375
+ feedback_data = load_content_from_hf()
376
+ feedback_df = pd.DataFrame(feedback_data)
377
+
378
+ # map vote to winner
379
+ feedback_df["winner"] = feedback_df["winner"].map(
380
+ {
381
+ "left": evalica.Winner.X,
382
+ "right": evalica.Winner.Y,
383
+ "tie": evalica.Winner.Draw,
384
+ }
385
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
386
 
387
+ # Calculate scores using various metrics
388
+ avr_result = evalica.average_win_rate(
389
+ feedback_df["left"], feedback_df["right"], feedback_df["winner"]
390
+ )
391
+ bt_result = evalica.bradley_terry(
392
+ feedback_df["left"], feedback_df["right"], feedback_df["winner"]
393
+ )
394
+ newman_result = evalica.newman(
395
+ feedback_df["left"], feedback_df["right"], feedback_df["winner"]
396
+ )
397
+ eigen_result = evalica.eigen(
398
+ feedback_df["left"], feedback_df["right"], feedback_df["winner"]
399
+ )
400
+ elo_result = evalica.elo(
401
+ feedback_df["left"], feedback_df["right"], feedback_df["winner"]
402
+ )
403
+ pagerank_result = evalica.pagerank(
404
+ feedback_df["left"], feedback_df["right"], feedback_df["winner"]
405
+ )
406
 
407
+ # Combine all results into a single DataFrame
408
+ leaderboard_data = pd.DataFrame(
409
+ {
410
+ "Model": elo_result.scores.index,
411
+ "Elo Score": elo_result.scores.values,
412
+ "Average Win Rate": avr_result.scores.values * 100,
413
+ "Bradley-Terry Coefficient": bt_result.scores.values,
414
+ "Eigenvector Centrality Value": eigen_result.scores.values,
415
+ "Newman Modularity Score": newman_result.scores.values,
416
+ "PageRank Score": pagerank_result.scores.values,
417
+ }
418
+ )
419
 
420
+ # Round all numeric columns to two decimal places
421
+ leaderboard_data = leaderboard_data.round(
422
+ {
423
+ "Elo Score": 2,
424
+ "Average Win Rate": 2,
425
+ "Bradley-Terry Coefficient": 2,
426
+ "Eigenvector Centrality Value": 2,
427
+ "Newman Modularity Score": 2,
428
+ "PageRank Score": 2,
429
+ }
430
+ )
 
 
 
 
431
 
432
+ # Add a Rank column based on Elo scores
433
+ leaderboard_data["Rank"] = (
434
+ leaderboard_data["Elo Score"].rank(ascending=False).astype(int)
435
+ )
436
+ except:
437
+ # If no feedback exists, return an empty DataFrame
438
+ return pd.DataFrame(
439
+ columns=[
440
+ "Rank",
441
+ "Model",
442
+ "Elo Score",
443
+ "Average Win Rate",
444
+ "Bradley-Terry Coefficient",
445
+ "Eigenvector Centrality Value",
446
+ "Newman Modularity Score",
447
+ "PageRank Score",
448
+ ]
449
+ )
450
+ return leaderboard_data
451
 
452
 
453
  # Function to enable or disable submit buttons based on textbox content
 
904
  )
905
 
906
  def submit_feedback(vote, models_state, conversation_state):
 
 
 
907
  # Map vote to actual model names
908
  match vote:
909
  case "Model A":
 
918
  "left": models_state["Model A"],
919
  "right": models_state["Model B"],
920
  "winner": winner_model,
921
+ "timestamp": datetime.now().strftime("%Y%m%d_%H%M%S"),
922
  }
923
 
924
+ # Concatenate the new feedback with the existing leaderboard data
925
+ leaderboard_data = pd.concat([get_leaderboard_data(), pd.DataFrame([feedback_entry])], ignore_index=True)
926
+
927
  # Save feedback back to the Hugging Face dataset
928
  save_content_to_hf(feedback_entry, "SE-Arena/votes")
929
 
 
934
  models_state.clear()
935
  conversation_state.clear()
936
 
 
 
 
937
  # Adjust output count to match the interface definition
938
  return (
939
  gr.update(
context_window.json CHANGED
@@ -14,9 +14,8 @@
14
  "llama-3.1-405b": 128000,
15
  "llama-3.1-70b": 128000,
16
  "llama-3.3-70b": 128000,
17
- "o1-all": 128000,
18
- "o1-mini-all": 128000,
19
- "Qwen2-72B-Instruct": 131072,
20
  "Qwen2.5-32B-Instruct": 131072,
21
  "qwen2.5-72b": 32768,
22
  "Qwen2.5-72B-Instruct": 131072,
 
14
  "llama-3.1-405b": 128000,
15
  "llama-3.1-70b": 128000,
16
  "llama-3.3-70b": 128000,
17
+ "o1": 128000,
18
+ "o1-mini": 128000,
 
19
  "Qwen2.5-32B-Instruct": 131072,
20
  "qwen2.5-72b": 32768,
21
  "Qwen2.5-72B-Instruct": 131072,