"""Leaderboard persistence, scoring, and ELO ratings for the model arena.

Battle results are stored as a JSON document on Nextcloud.  ELO ratings are
kept in memory (``elo_ratings``) and rebuilt from the stored results whenever
they are missing.
"""

import json
import math
import os
import sys
import threading
import time
from datetime import datetime
from typing import Any, Dict, List, Optional, Sequence, Tuple

import plotly.graph_objects as go
from nc_py_api import Nextcloud

import arena_config

# Initialize Nextcloud client
nc = Nextcloud(
    nextcloud_url=arena_config.NEXTCLOUD_URL,
    nc_auth_user=arena_config.NEXTCLOUD_USERNAME,
    nc_auth_pass=arena_config.NEXTCLOUD_PASSWORD,
)

# Dictionary to store ELO ratings (model name -> current rating)
elo_ratings: Dict[str, float] = {}

# Size assumed for models that are unknown or whose size cannot be parsed.
DEFAULT_MODEL_SIZE = 1.0
# Initial rating is INITIAL_RATING_BASE + size * INITIAL_RATING_PER_SIZE_UNIT.
INITIAL_RATING_BASE = 1000
INITIAL_RATING_PER_SIZE_UNIT = 100
# Pause between two automatic backups.
BACKUP_INTERVAL_SECONDS = 3600

RANK_MEDALS = {1: "🥇", 2: "🥈", 3: "🥉"}


def load_leaderboard() -> Dict[str, Any]:
    """Download and decode the leaderboard JSON from Nextcloud.

    Returns:
        The decoded leaderboard, or an empty dict when the download or the
        decoding fails (the error is printed, not raised).
    """
    try:
        file_content = nc.files.download(arena_config.NEXTCLOUD_LEADERBOARD_PATH)
        return json.loads(file_content.decode('utf-8'))
    except Exception as e:
        print(f"Error loading leaderboard: {e}")
        return {}


def save_leaderboard(leaderboard_data: Dict[str, Any]) -> bool:
    """Serialize ``leaderboard_data`` to JSON and upload it to Nextcloud.

    Returns:
        True when the upload succeeded, False otherwise (the error is
        printed, not raised).
    """
    try:
        json_data = json.dumps(leaderboard_data, indent=2)
        nc.files.upload(arena_config.NEXTCLOUD_LEADERBOARD_PATH, json_data.encode('utf-8'))
        return True
    except Exception as e:
        print(f"Error saving leaderboard: {e}")
        return False


def get_model_size(model_name):
    """Return the size parsed from the model's human-readable name.

    The size is the number between the first ``(`` and the following ``B``
    of the display name configured in ``arena_config.APPROVED_MODELS``.
    ``DEFAULT_MODEL_SIZE`` is returned when the model is not listed or when
    its display name does not contain a parsable size.
    """
    for model, human_readable in arena_config.APPROVED_MODELS:
        if model == model_name:
            try:
                return float(human_readable.split('(')[1].split('B')[0])
            except (IndexError, ValueError):
                # Display name without a "(<number>B" fragment: use the
                # default instead of crashing every rating computation.
                return DEFAULT_MODEL_SIZE
    return DEFAULT_MODEL_SIZE  # Default size if not found


def calculate_expected_score(rating_a, rating_b):
    """Return the ELO expected score of a player rated ``rating_a`` against ``rating_b``."""
    return 1 / (1 + math.pow(10, (rating_b - rating_a) / 400))


def _max_model_size() -> float:
    """Return the largest size among the approved models (never zero)."""
    largest = max(
        (get_model_size(model) for model, _ in arena_config.APPROVED_MODELS),
        default=DEFAULT_MODEL_SIZE,
    )
    # The result is used as a divisor, so guard against a degenerate config.
    return largest if largest > 0 else DEFAULT_MODEL_SIZE


def _initial_rating(model: str) -> float:
    """Return the size-based starting rating of ``model``."""
    return INITIAL_RATING_BASE + get_model_size(model) * INITIAL_RATING_PER_SIZE_UNIT


def _apply_elo_result(ratings: Dict[str, float], winner: str, loser: str, max_size: float) -> None:
    """Apply one battle result to ``ratings`` in place.

    Both models must already be present in ``ratings``.  The K-factor grows
    when the winner is smaller than the loser and is capped at 64.
    """
    expected_winner = calculate_expected_score(ratings[winner], ratings[loser])
    expected_loser = 1 - expected_winner

    size_gap = get_model_size(loser) - get_model_size(winner)
    k_factor = min(64, 32 * (1 + size_gap / max_size))

    ratings[winner] += k_factor * (1 - expected_winner)
    ratings[loser] += k_factor * (0 - expected_loser)


def update_elo_ratings(winner, loser):
    """Update the in-memory ELO ratings after ``winner`` beat ``loser``.

    Ratings are (re)built from the stored leaderboard first when either model
    has no rating yet.  Models that are still unknown afterwards (for example
    not listed in ``APPROVED_MODELS``) start from their default initial rating
    instead of raising ``KeyError``.
    """
    if winner not in elo_ratings or loser not in elo_ratings:
        initialize_elo_ratings()

    for model in (winner, loser):
        elo_ratings.setdefault(model, _initial_rating(model))

    _apply_elo_result(elo_ratings, winner, loser, _max_model_size())


def initialize_elo_ratings():
    """Rebuild ``elo_ratings`` from the stored leaderboard.

    Every approved model starts at its size-based initial rating, then all
    recorded battles are replayed.
    """
    leaderboard = load_leaderboard()
    ratings = {model: _initial_rating(model) for model, _ in arena_config.APPROVED_MODELS}
    max_size = _max_model_size()

    # Replay all battles to update ELO ratings.  Each battle is stored twice
    # (as a win for the winner and as a loss for the loser), so only the wins
    # are replayed; replaying the losses as well would count every battle
    # twice.  The helper is applied directly rather than going through
    # update_elo_ratings(), which would recurse back into this function for
    # any stored model that is not in APPROVED_MODELS.
    for model, data in leaderboard.items():
        for opponent, results in data.get('opponents', {}).items():
            for name in (model, opponent):
                ratings.setdefault(name, _initial_rating(name))
            for _ in range(results.get('wins', 0)):
                _apply_elo_result(ratings, model, opponent, max_size)

    # Mutate the shared dict in place so existing references stay valid.
    elo_ratings.clear()
    elo_ratings.update(ratings)


def ensure_elo_ratings_initialized():
    """Build the ELO ratings if none are in memory yet."""
    if not elo_ratings:
        initialize_elo_ratings()


def _new_entry() -> Dict[str, Any]:
    """Return an empty leaderboard entry for a model."""
    return {"wins": 0, "losses": 0, "opponents": {}}


def update_leaderboard(winner: str, loser: str) -> Dict[str, Any]:
    """Record one battle, update the ELO ratings, and persist the leaderboard.

    Returns:
        The updated leaderboard.
    """
    # NOTE(review): load_leaderboard() returns {} on a failed download, in
    # which case the save below uploads a leaderboard containing only this
    # battle -- confirm whether that overwrite is acceptable.
    leaderboard = load_leaderboard()

    winner_entry = leaderboard.setdefault(winner, _new_entry())
    loser_entry = leaderboard.setdefault(loser, _new_entry())

    winner_entry["wins"] += 1
    winner_entry["opponents"].setdefault(loser, {"wins": 0, "losses": 0})["wins"] += 1

    loser_entry["losses"] += 1
    loser_entry["opponents"].setdefault(winner, {"wins": 0, "losses": 0})["losses"] += 1

    # Update ELO ratings
    update_elo_ratings(winner, loser)

    save_leaderboard(leaderboard)
    return leaderboard


def get_current_leaderboard() -> Dict[str, Any]:
    """Return the leaderboard as currently stored on Nextcloud."""
    return load_leaderboard()


def get_human_readable_name(model_name: str) -> str:
    """Return the configured display name of ``model_name`` (or the name itself)."""
    model_dict = dict(arena_config.APPROVED_MODELS)
    return model_dict.get(model_name, model_name)


def _rank_by_score(leaderboard: Dict[str, Any]) -> List[Tuple[str, Dict[str, Any]]]:
    """Add a ``score`` to every entry and return the entries best-first.

    Score = win rate * (1 - 1 / (total battles + 1)); entries without battles
    score 0.  Ties on score are broken by the number of battles.
    """
    for results in leaderboard.values():
        total_battles = results["wins"] + results["losses"]
        if total_battles > 0:
            win_rate = results["wins"] / total_battles
            results["score"] = win_rate * (1 - 1 / (total_battles + 1))
        else:
            results["score"] = 0

    return sorted(
        leaderboard.items(),
        key=lambda item: (item[1]["score"], item[1]["wins"] + item[1]["losses"]),
        reverse=True,
    )


def _render_table(headers: Sequence[str], rows: Sequence[Sequence[Any]]) -> str:
    """Render ``headers`` and ``rows`` as an HTML table.

    NOTE(review): the markup of the original templates is not recoverable
    from the source; this is a minimal, unstyled reconstruction -- confirm
    the expected classes/styling with the UI that embeds it.
    """
    head = "".join(f"<th>{header}</th>" for header in headers)
    body = "".join(
        "<tr>" + "".join(f"<td>{cell}</td>" for cell in row) + "</tr>"
        for row in rows
    )
    return f"<table><thead><tr>{head}</tr></thead><tbody>{body}</tbody></table>"


def get_leaderboard():
    """Return the win-rate leaderboard as an HTML fragment."""
    sorted_results = _rank_by_score(load_leaderboard())

    # Explanation of the main leaderboard
    explanation = """
<p>
This leaderboard uses a scoring system that balances win rate and total battles. The score is calculated using the formula:<br>
<strong>Score = Win Rate * (1 - 1 / (Total Battles + 1))</strong><br>
This formula rewards models with higher win rates and more battles. As the number of battles increases, the score approaches the win rate.
</p>
"""

    headers = [
        "Rank", "Model", "Score", "Wins", "Losses", "Win Rate",
        "Total Battles", "Top Rival", "Toughest Opponent",
    ]
    rows = []
    for index, (model, results) in enumerate(sorted_results, start=1):
        total_battles = results["wins"] + results["losses"]
        win_rate = (results["wins"] / total_battles * 100) if total_battles > 0 else 0
        rank_display = RANK_MEDALS.get(index, f"{index}")
        opponents = results.get("opponents", {})

        # Opponent this model has beaten most often.
        top_rival = max(
            opponents.items(), key=lambda x: x[1]["wins"], default=(None, {"wins": 0})
        )
        top_rival_name = get_human_readable_name(top_rival[0]) if top_rival[0] else "N/A"
        top_rival_wins = top_rival[1]["wins"]

        # Opponent this model has lost to most often.
        toughest_opponent = max(
            opponents.items(), key=lambda x: x[1]["losses"], default=(None, {"losses": 0})
        )
        toughest_opponent_name = (
            get_human_readable_name(toughest_opponent[0]) if toughest_opponent[0] else "N/A"
        )
        toughest_opponent_losses = toughest_opponent[1]["losses"]

        rows.append([
            rank_display,
            get_human_readable_name(model),
            f"{results['score']:.4f}",
            results['wins'],
            results['losses'],
            f"{win_rate:.2f}%",
            total_battles,
            f"{top_rival_name} (W: {top_rival_wins})",
            f"{toughest_opponent_name} (L: {toughest_opponent_losses})",
        ])

    return f"{explanation}\n{_render_table(headers, rows)}\n"


def calculate_elo_impact(model, leaderboard: Optional[Dict[str, Any]] = None):
    """Return ``(positive_impact, negative_impact, initial_rating)`` for ``model``.

    Wins against larger models and losses against smaller models weigh more
    than 1; all other results weigh exactly 1.  All three values are rounded.

    Args:
        model: Model identifier.
        leaderboard: Already-loaded leaderboard; downloaded when omitted.
            Passing it avoids one Nextcloud download per call.
    """
    if leaderboard is None:
        leaderboard = load_leaderboard()

    positive_impact = 0
    negative_impact = 0
    initial_rating = _initial_rating(model)

    # Loop-invariant values.
    model_size = get_model_size(model)
    max_size = _max_model_size()

    # A model without recorded battles simply has no impact.
    opponents = leaderboard.get(model, {}).get('opponents', {})
    for opponent, results in opponents.items():
        size_difference = (get_model_size(opponent) - model_size) / max_size

        win_impact = 1 + max(0, size_difference)
        loss_impact = 1 + max(0, -size_difference)

        positive_impact += results['wins'] * win_impact
        negative_impact += results['losses'] * loss_impact

    return round(positive_impact), round(negative_impact), round(initial_rating)


def get_elo_leaderboard():
    """Return the ELO leaderboard as an HTML fragment."""
    ensure_elo_ratings_initialized()
    leaderboard = load_leaderboard()

    sorted_ratings = sorted(elo_ratings.items(), key=lambda x: x[1], reverse=True)

    initial_ratings = [_initial_rating(model) for model, _ in arena_config.APPROVED_MODELS]
    min_initial_rating = min(initial_ratings, default=INITIAL_RATING_BASE)
    max_initial_rating = max(initial_ratings, default=INITIAL_RATING_BASE)

    explanation_elo = f"""
<p>
This leaderboard uses a modified ELO rating system that takes into account both the performance and size of the models. Initial ratings range from {round(min_initial_rating)} to {round(max_initial_rating)} points, based on model size, with larger models starting at higher ratings. The "Positive Impact" score reflects the significance of wins, with higher scores for defeating larger models. The "Negative Impact" score indicates the significance of losses, with higher scores for losing against smaller models. The current ELO rating is calculated based on these impacts and the model's performance history.
</p>
"""

    headers = [
        "Rank", "Model", "Current ELO Rating", "Positive Impact",
        "Negative Impact", "Total Battles", "Initial Rating",
    ]
    rows = []
    for index, (model, rating) in enumerate(sorted_ratings, start=1):
        # Approved models without any battle have no leaderboard entry yet.
        entry = leaderboard.get(model, {})
        total_battles = entry.get('wins', 0) + entry.get('losses', 0)
        rank_display = RANK_MEDALS.get(index, f"{index}")
        positive_impact, negative_impact, initial_rating = calculate_elo_impact(
            model, leaderboard
        )

        rows.append([
            rank_display,
            get_human_readable_name(model),
            round(rating),
            positive_impact,
            negative_impact,
            total_battles,
            initial_rating,
        ])

    return f"{explanation_elo}\n{_render_table(headers, rows)}\n"


def create_backup():
    """Upload a timestamped copy of the leaderboard to Nextcloud, forever.

    Runs an endless loop with one backup attempt per
    ``BACKUP_INTERVAL_SECONDS``; failures are printed and the loop continues.
    Intended to run in a background thread (see ``start_backup_thread``).
    """
    while True:
        try:
            leaderboard_data = load_leaderboard()
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            backup_file_name = f"leaderboard_backup_{timestamp}.json"
            backup_path = f"{arena_config.NEXTCLOUD_BACKUP_FOLDER}/{backup_file_name}"
            json_data = json.dumps(leaderboard_data, indent=2)
            nc.files.upload(backup_path, json_data.encode('utf-8'))
            print(f"Backup created on Nextcloud: {backup_path}")
        except Exception as e:
            print(f"Error creating backup: {e}")
        time.sleep(BACKUP_INTERVAL_SECONDS)  # Sleep for 1 HOUR


def start_backup_thread():
    """Start ``create_backup`` in a daemon thread."""
    backup_thread = threading.Thread(target=create_backup, daemon=True)
    backup_thread.start()


def get_leaderboard_chart():
    """Return a Plotly figure with stacked wins/losses bars and a score line."""
    # Calculate scores and sort results
    sorted_results = _rank_by_score(get_current_leaderboard())

    models = [get_human_readable_name(model) for model, _ in sorted_results]
    wins = [results["wins"] for _, results in sorted_results]
    losses = [results["losses"] for _, results in sorted_results]
    scores = [results["score"] for _, results in sorted_results]

    fig = go.Figure()

    # Stacked Bar chart for Wins and Losses
    fig.add_trace(go.Bar(
        x=models,
        y=wins,
        name='Wins',
        marker_color='#22577a'
    ))
    fig.add_trace(go.Bar(
        x=models,
        y=losses,
        name='Losses',
        marker_color='#38a3a5'
    ))

    # Line chart for Scores
    fig.add_trace(go.Scatter(
        x=models,
        y=scores,
        name='Score',
        yaxis='y2',
        line=dict(color='#ff7f0e', width=2)
    ))

    # Update layout for full-width, increased height, and secondary y-axis
    fig.update_layout(
        title='Model Performance',
        xaxis_title='Models',
        yaxis_title='Number of Battles',
        yaxis2=dict(
            title='Score',
            overlaying='y',
            side='right'
        ),
        barmode='stack',
        height=800,
        width=1450,
        autosize=True,
        legend=dict(
            orientation='h',
            yanchor='bottom',
            y=1.02,
            xanchor='right',
            x=1
        )
    )

    return fig