from nc_py_api import Nextcloud import json from typing import Dict, Any import os import time from datetime import datetime import threading import arena_config import sys import math import plotly.graph_objects as go # Initialize Nextcloud client nc = Nextcloud(nextcloud_url=arena_config.NEXTCLOUD_URL, nc_auth_user=arena_config.NEXTCLOUD_USERNAME, nc_auth_pass=arena_config.NEXTCLOUD_PASSWORD) # Dictionary to store ELO ratings elo_ratings = {} def load_leaderboard() -> Dict[str, Any]: try: file_content = nc.files.download(arena_config.NEXTCLOUD_LEADERBOARD_PATH) return json.loads(file_content.decode('utf-8')) except Exception as e: print(f"Error loading leaderboard: {str(e)}") return {} def save_leaderboard(leaderboard_data: Dict[str, Any]) -> bool: try: json_data = json.dumps(leaderboard_data, indent=2) nc.files.upload(arena_config.NEXTCLOUD_LEADERBOARD_PATH, json_data.encode('utf-8')) return True except Exception as e: print(f"Error saving leaderboard: {str(e)}") return False def get_model_size(model_name): for model, human_readable in arena_config.APPROVED_MODELS: if model == model_name: size = float(human_readable.split('(')[1].split('B')[0]) return size return 1.0 # Default size if not found def calculate_expected_score(rating_a, rating_b): return 1 / (1 + math.pow(10, (rating_b - rating_a) / 400)) def update_elo_ratings(winner, loser): if winner not in elo_ratings or loser not in elo_ratings: initialize_elo_ratings() winner_rating = elo_ratings[winner] loser_rating = elo_ratings[loser] expected_winner = calculate_expected_score(winner_rating, loser_rating) expected_loser = 1 - expected_winner winner_size = get_model_size(winner) loser_size = get_model_size(loser) max_size = max(get_model_size(model) for model, _ in arena_config.APPROVED_MODELS) k_factor = min(64, 32 * (1 + (loser_size - winner_size) / max_size)) elo_ratings[winner] += k_factor * (1 - expected_winner) elo_ratings[loser] += k_factor * (0 - expected_loser) def initialize_elo_ratings(): leaderboard = load_leaderboard() for model, _ in arena_config.APPROVED_MODELS: size = get_model_size(model) elo_ratings[model] = 1000 + (size * 100) # Replay all battles to update ELO ratings for model, data in leaderboard.items(): if model not in elo_ratings: elo_ratings[model] = 1000 + (get_model_size(model) * 100) for opponent, results in data['opponents'].items(): if opponent not in elo_ratings: elo_ratings[opponent] = 1000 + (get_model_size(opponent) * 100) for _ in range(results['wins']): update_elo_ratings(model, opponent) for _ in range(results['losses']): update_elo_ratings(opponent, model) def ensure_elo_ratings_initialized(): if not elo_ratings: initialize_elo_ratings() def update_leaderboard(winner: str, loser: str) -> Dict[str, Any]: leaderboard = load_leaderboard() if winner not in leaderboard: leaderboard[winner] = {"wins": 0, "losses": 0, "opponents": {}} if loser not in leaderboard: leaderboard[loser] = {"wins": 0, "losses": 0, "opponents": {}} leaderboard[winner]["wins"] += 1 leaderboard[winner]["opponents"].setdefault(loser, {"wins": 0, "losses": 0})["wins"] += 1 leaderboard[loser]["losses"] += 1 leaderboard[loser]["opponents"].setdefault(winner, {"wins": 0, "losses": 0})["losses"] += 1 # Update ELO ratings update_elo_ratings(winner, loser) save_leaderboard(leaderboard) return leaderboard def get_current_leaderboard() -> Dict[str, Any]: return load_leaderboard() def get_human_readable_name(model_name: str) -> str: model_dict = dict(arena_config.APPROVED_MODELS) return model_dict.get(model_name, model_name) def get_leaderboard(): leaderboard = load_leaderboard() # Calculate scores for each model for model, results in leaderboard.items(): total_battles = results["wins"] + results["losses"] if total_battles > 0: win_rate = results["wins"] / total_battles results["score"] = win_rate * (1 - 1 / (total_battles + 1)) else: results["score"] = 0 # Sort results by score, then by total battles sorted_results = sorted( leaderboard.items(), key=lambda x: (x[1]["score"], x[1]["wins"] + x[1]["losses"]), reverse=True ) # Explanation of the main leaderboard explanation = """
This leaderboard uses a scoring system that balances win rate and total battles. The score is calculated using the formula:
Score = Win Rate * (1 - 1 / (Total Battles + 1))
This formula rewards models with higher win rates and more battles. As the number of battles increases, the score approaches the win rate.
Rank | Model | Score | Wins | Losses | Win Rate | Total Battles | Top Rival | Toughest Opponent |
---|---|---|---|---|---|---|---|---|
{rank_display} | {get_human_readable_name(model)} | {results['score']:.4f} | {results['wins']} | {results['losses']} | {win_rate:.2f}% | {total_battles} | {top_rival_name} (W: {top_rival_wins}) | {toughest_opponent_name} (L: {toughest_opponent_losses}) |
This leaderboard uses a modified ELO rating system that takes into account both the performance and size of the models. Initial ratings range from {round(min_initial_rating)} to {round(max_initial_rating)} points, based on model size, with larger models starting at higher ratings. The "Positive Impact" score reflects the significance of wins, with higher scores for defeating larger models. The "Negative Impact" score indicates the significance of losses, with higher scores for losing against smaller models. The current ELO rating is calculated based on these impacts and the model's performance history.
""" leaderboard_html = f""" {explanation_elo}Rank | Model | Current ELO Rating | Positive Impact | Negative Impact | Total Battles | Initial Rating |
---|---|---|---|---|---|---|
{rank_display} | {get_human_readable_name(data['model'])} | {round(data['current_rating'])} | {data['positive_impact']} | {data['negative_impact']} | {data['total_battles']} | {round(data['initial_rating'])} |