Update leaderboard.py

leaderboard.py  +166 -47  CHANGED
@@ -6,10 +6,15 @@ import time
 from datetime import datetime
 import threading
 import arena_config
+import sys
+import math
 
 # Initialize Nextcloud client
 nc = Nextcloud(nextcloud_url=arena_config.NEXTCLOUD_URL, nc_auth_user=arena_config.NEXTCLOUD_USERNAME, nc_auth_pass=arena_config.NEXTCLOUD_PASSWORD)
 
+# Dictionary to store ELO ratings
+elo_ratings = {}
+
 def load_leaderboard() -> Dict[str, Any]:
     try:
         file_content = nc.files.download(arena_config.NEXTCLOUD_LEADERBOARD_PATH)
@@ -27,6 +32,53 @@ def save_leaderboard(leaderboard_data: Dict[str, Any]) -> bool:
         print(f"Error saving leaderboard: {str(e)}")
         return False
 
+def get_model_size(model_name):
+    for model, human_readable in arena_config.APPROVED_MODELS:
+        if model == model_name:
+            size = float(human_readable.split('(')[1].split('B')[0])
+            return size
+    return 1.0  # Default size if not found
+
+def calculate_expected_score(rating_a, rating_b):
+    return 1 / (1 + math.pow(10, (rating_b - rating_a) / 400))
+
+def update_elo_ratings(winner, loser):
+    if winner not in elo_ratings or loser not in elo_ratings:
+        initialize_elo_ratings()
+
+    winner_rating = elo_ratings[winner]
+    loser_rating = elo_ratings[loser]
+
+    expected_winner = calculate_expected_score(winner_rating, loser_rating)
+    expected_loser = 1 - expected_winner
+
+    winner_size = get_model_size(winner)
+    loser_size = get_model_size(loser)
+    max_size = max(get_model_size(model) for model, _ in arena_config.APPROVED_MODELS)
+
+    k_factor = 32 * (1 + (loser_size - winner_size) / max_size)
+
+    elo_ratings[winner] += k_factor * (1 - expected_winner)
+    elo_ratings[loser] += k_factor * (0 - expected_loser)
+
+def initialize_elo_ratings():
+    leaderboard = load_leaderboard()
+    for model, _ in arena_config.APPROVED_MODELS:
+        size = get_model_size(model)
+        elo_ratings[model] = 1000 + (size * 100)
+
+    # Replay all battles to update ELO ratings
+    for model, data in leaderboard.items():
+        for opponent, results in data['opponents'].items():
+            for _ in range(results['wins']):
+                update_elo_ratings(model, opponent)
+            for _ in range(results['losses']):
+                update_elo_ratings(opponent, model)
+
+def ensure_elo_ratings_initialized():
+    if not elo_ratings:
+        initialize_elo_ratings()
+
 def update_leaderboard(winner: str, loser: str) -> Dict[str, Any]:
     leaderboard = load_leaderboard()
 
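The new helpers above lean on two conventions worth noting: get_model_size() assumes the human-readable half of each arena_config.APPROVED_MODELS pair embeds the parameter count as text like "(7B)", and the K-factor is stretched by the size gap between the two models, so an upset by a smaller model moves more points. A rough sketch of the arithmetic with invented ratings and sizes (not values from the arena):

import math

def expected(rating_a, rating_b):
    # Same formula as calculate_expected_score() in the diff above
    return 1 / (1 + math.pow(10, (rating_b - rating_a) / 400))

# Hypothetical pairing: a 3B model rated 1300 upsets an 8B model rated 1800
small_rating, small_size = 1300.0, 3.0
large_rating, large_size = 1800.0, 8.0
max_size = 8.0  # largest size among the approved models in this sketch

e_small = expected(small_rating, large_rating)        # ~0.053, a clear underdog
k = 32 * (1 + (large_size - small_size) / max_size)   # 32 * 1.625 = 52.0
gain = k * (1 - e_small)                              # ~49.2 points for the winner
print(round(gain, 1))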
@@ -41,62 +93,28 @@ def update_leaderboard(winner: str, loser: str) -> Dict[str, Any]:
     leaderboard[loser]["losses"] += 1
     leaderboard[loser]["opponents"].setdefault(winner, {"wins": 0, "losses": 0})["losses"] += 1
 
+    # Update ELO ratings
+    update_elo_ratings(winner, loser)
+
     save_leaderboard(leaderboard)
     return leaderboard
 
-# Function to get the current leaderboard
 def get_current_leaderboard() -> Dict[str, Any]:
     return load_leaderboard()
 
-def create_backup():
-    while True:
-        try:
-            leaderboard_data = load_leaderboard()
-
-            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-            backup_file_name = f"leaderboard_backup_{timestamp}.json"
-            backup_path = f"{arena_config.NEXTCLOUD_BACKUP_FOLDER}/{backup_file_name}"
-
-            json_data = json.dumps(leaderboard_data, indent=2)
-
-            nc.files.upload(backup_path, json_data.encode('utf-8'))
-
-            print(f"Backup created on Nextcloud: {backup_path}")
-
-        except Exception as e:
-            print(f"Error creating backup: {e}")
-
-        time.sleep(3600)  # Sleep for 1 hour
-
-def start_backup_thread():
-    backup_thread = threading.Thread(target=create_backup, daemon=True)
-    backup_thread.start()
-
-
 def get_human_readable_name(model_name: str) -> str:
     model_dict = dict(arena_config.APPROVED_MODELS)
     return model_dict.get(model_name, model_name)
 
 def get_leaderboard():
-
-
-    # Calculate scores for each model
-    for model, results in battle_results.items():
-        total_battles = results["wins"] + results["losses"]
-        if total_battles > 0:
-            win_rate = results["wins"] / total_battles
-            results["score"] = win_rate * (1 - 1 / (total_battles + 1))
-        else:
-            results["score"] = 0
-
-    # Sort results by score, then by total battles
+    leaderboard = load_leaderboard()
     sorted_results = sorted(
-        battle_results.items(),
-        key=lambda x: (x[1]["score"], x[1]["wins"] + x[1]["losses"]),
+        leaderboard.items(),
+        key=lambda x: (x[1]["wins"] / (x[1]["wins"] + x[1]["losses"]) if x[1]["wins"] + x[1]["losses"] > 0 else 0, x[1]["wins"] + x[1]["losses"]),
         reverse=True
     )
 
-
+    leaderboard_html = """
     <style>
     .leaderboard-table {
         width: 100%;
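With this hunk, get_leaderboard() stops computing the old smoothed score (win_rate * (1 - 1 / (total_battles + 1))) and instead sorts by raw win rate, using total battles only as a tie-breaker. A toy illustration with invented records (not real arena data):

records = {
    "model-a": {"wins": 8, "losses": 2},   # 80% win rate over 10 battles
    "model-b": {"wins": 4, "losses": 1},   # 80% win rate over 5 battles
    "model-c": {"wins": 1, "losses": 0},   # 100% win rate over a single battle
}
ranked = sorted(
    records.items(),
    key=lambda x: (x[1]["wins"] / (x[1]["wins"] + x[1]["losses"])
                   if x[1]["wins"] + x[1]["losses"] > 0 else 0,
                   x[1]["wins"] + x[1]["losses"]),
    reverse=True,
)
print([name for name, _ in ranked])  # ['model-c', 'model-a', 'model-b']

The trade-off: the removed score term damped small samples, so under the new key a model with a single lucky win sorts above an 80% model with a long track record.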
@@ -125,7 +143,6 @@ def get_leaderboard():
         <tr>
             <th class='rank-column'>Rank</th>
             <th>Model</th>
-            <th>Score</th>
             <th>Wins</th>
             <th>Losses</th>
             <th>Win Rate</th>
@@ -139,7 +156,7 @@ def get_leaderboard():
         total_battles = results["wins"] + results["losses"]
         win_rate = (results["wins"] / total_battles * 100) if total_battles > 0 else 0
 
-        rank_display = {1: "🥇", 2: "🥈", 3: "🥉"
+        rank_display = {1: "🥇", 2: "🥈", 3: "🥉"}.get(index, f"{index}")
 
         top_rival = max(results["opponents"].items(), key=lambda x: x[1]["wins"], default=(None, {"wins": 0}))
         top_rival_name = get_human_readable_name(top_rival[0]) if top_rival[0] else "N/A"
@@ -149,11 +166,10 @@ def get_leaderboard():
         toughest_opponent_name = get_human_readable_name(toughest_opponent[0]) if toughest_opponent[0] else "N/A"
         toughest_opponent_losses = toughest_opponent[1]["losses"]
 
-
+        leaderboard_html += f"""
         <tr>
             <td class='rank-column'>{rank_display}</td>
             <td>{get_human_readable_name(model)}</td>
-            <td>{results['score']:.4f}</td>
             <td>{results['wins']}</td>
             <td>{results['losses']}</td>
             <td>{win_rate:.2f}%</td>
@@ -162,5 +178,108 @@ def get_leaderboard():
             <td class='opponent-details'>{toughest_opponent_name} (L: {toughest_opponent_losses})</td>
         </tr>
         """
-
-    return
+    leaderboard_html += "</table>"
+    return leaderboard_html
+
+def get_elo_leaderboard():
+    ensure_elo_ratings_initialized()
+    leaderboard = load_leaderboard()
+    sorted_ratings = sorted(elo_ratings.items(), key=lambda x: x[1], reverse=True)
+
+    min_initial_rating = min(1000 + (get_model_size(model) * 100) for model, _ in arena_config.APPROVED_MODELS)
+    max_initial_rating = max(1000 + (get_model_size(model) * 100) for model, _ in arena_config.APPROVED_MODELS)
+
+    explanation = f"""
+    <p style="font-size: 16px; margin-bottom: 20px;">
+        This leaderboard uses a modified ELO rating system that takes into account both the performance and size of the models.
+        Initial ratings range from {round(min_initial_rating)} to {round(max_initial_rating)} points, based on model size, with larger models starting at higher ratings.
+        When a smaller model defeats a larger one, it gains more points, while larger models gain fewer points for beating smaller ones.
+        The "Points Scored" column shows the total ELO points gained by the model from its victories, reflecting both quantity and quality of wins.
+        The "Points Lost" column shows the total ELO points lost by the model from its defeats, indicating the challenges faced.
+    </p>
+    """
+
+    leaderboard_html = f"""
+    {explanation}
+    <style>
+    .elo-leaderboard-table {{
+        width: 100%;
+        border-collapse: collapse;
+        font-family: Arial, sans-serif;
+    }}
+    .elo-leaderboard-table th, .elo-leaderboard-table td {{
+        border: 1px solid #ddd;
+        padding: 8px;
+        text-align: left;
+    }}
+    .elo-leaderboard-table th {{
+        background-color: rgba(255, 255, 255, 0.1);
+        font-weight: bold;
+    }}
+    .rank-column {{
+        width: 60px;
+        text-align: center;
+    }}
+    </style>
+    <table class='elo-leaderboard-table'>
+        <tr>
+            <th class='rank-column'>Rank</th>
+            <th>Model</th>
+            <th>ELO Rating</th>
+            <th>Points Scored</th>
+            <th>Points Lost</th>
+        </tr>
+    """
+
+    for index, (model, rating) in enumerate(sorted_ratings, start=1):
+        rank_display = {1: "🥇", 2: "🥈", 3: "🥉"}.get(index, f"{index}")
+        model_size = get_model_size(model)
+
+        points_scored = 0
+        points_lost = 0
+        if model in leaderboard:
+            for opponent, results in leaderboard[model]['opponents'].items():
+                opponent_rating = elo_ratings.get(opponent, 1000)
+                opponent_size = get_model_size(opponent)
+                max_size = max(get_model_size(m) for m, _ in arena_config.APPROVED_MODELS)
+
+                for _ in range(results['wins']):
+                    expected_score = calculate_expected_score(rating, opponent_rating)
+                    k_factor = 32 * (1 + (opponent_size - model_size) / max_size)
+                    points_scored += k_factor * (1 - expected_score)
+
+                for _ in range(results['losses']):
+                    expected_score = calculate_expected_score(rating, opponent_rating)
+                    k_factor = 32 * (1 + (model_size - opponent_size) / max_size)
+                    points_lost += k_factor * expected_score
+
+        leaderboard_html += f"""
+        <tr>
+            <td class='rank-column'>{rank_display}</td>
+            <td>{get_human_readable_name(model)}</td>
+            <td>{round(rating)}</td>
+            <td>{round(points_scored, 2)}</td>
+            <td>{round(points_lost, 2)}</td>
+        </tr>
+        """
+
+    leaderboard_html += "</table>"
+    return leaderboard_html
+
+def create_backup():
+    while True:
+        try:
+            leaderboard_data = load_leaderboard()
+            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+            backup_file_name = f"leaderboard_backup_{timestamp}.json"
+            backup_path = f"{arena_config.NEXTCLOUD_BACKUP_FOLDER}/{backup_file_name}"
+            json_data = json.dumps(leaderboard_data, indent=2)
+            nc.files.upload(backup_path, json_data.encode('utf-8'))
+            print(f"Backup created on Nextcloud: {backup_path}")
+        except Exception as e:
+            print(f"Error creating backup: {e}")
+        time.sleep(3600)  # Sleep for 1 hour
+
+def start_backup_thread():
+    backup_thread = threading.Thread(target=create_backup, daemon=True)
+    backup_thread.start()
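Nothing in this diff shows how the Space calls these functions; the sketch below is only a guess at the wiring, with gradio, the importing module, and the tab layout all assumed rather than taken from the repository:

import gradio as gr
import leaderboard  # the module changed in this commit

leaderboard.start_backup_thread()  # hourly JSON backups to Nextcloud in the background

with gr.Blocks() as demo:
    with gr.Tab("Win/Loss Leaderboard"):
        gr.HTML(leaderboard.get_leaderboard)        # callable value, evaluated on page load
    with gr.Tab("ELO Leaderboard"):
        gr.HTML(leaderboard.get_elo_leaderboard)    # new size-weighted ELO table

demo.launch()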