from nc_py_api import Nextcloud
import json
from typing import Dict, Any
import os
import time
from datetime import datetime
import threading
import arena_config
import sys
import math
import plotly.graph_objects as go

# Initialize Nextcloud client
nc = Nextcloud(
    nextcloud_url=arena_config.NEXTCLOUD_URL,
    nc_auth_user=arena_config.NEXTCLOUD_USERNAME,
    nc_auth_pass=arena_config.NEXTCLOUD_PASSWORD,
)

# Dictionary to store ELO ratings
elo_ratings = {}

def load_leaderboard() -> Dict[str, Any]:
    try:
        file_content = nc.files.download(arena_config.NEXTCLOUD_LEADERBOARD_PATH)
        return json.loads(file_content.decode('utf-8'))
    except Exception as e:
        print(f"Error loading leaderboard: {str(e)}")
        return {}

def save_leaderboard(leaderboard_data: Dict[str, Any]) -> bool:
    try:
        json_data = json.dumps(leaderboard_data, indent=2)
        nc.files.upload(arena_config.NEXTCLOUD_LEADERBOARD_PATH, json_data.encode('utf-8'))
        return True
    except Exception as e:
        print(f"Error saving leaderboard: {str(e)}")
        return False

def get_model_size(model_name):
    for model, human_readable in arena_config.APPROVED_MODELS:
        if model == model_name:
            size = float(human_readable.split('(')[1].split('B')[0])
            return size
    return 1.0  # Default size if not found
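
# Note: get_model_size assumes the human-readable entries in
# arena_config.APPROVED_MODELS embed the parameter count in parentheses,
# e.g. a (hypothetical) entry like ("some-model", "Some Model (8B)") parses
# to a size of 8.0; models not present in the list fall back to 1.0.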

def calculate_expected_score(rating_a, rating_b):
    return 1 / (1 + math.pow(10, (rating_b - rating_a) / 400))
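
# Worked example of the standard Elo expectation above: with a 400-point gap,
# calculate_expected_score(1400, 1000) = 1 / (1 + 10 ** -1) ≈ 0.909, i.e. the
# higher-rated model is expected to win roughly 91% of the time; equal ratings
# give exactly 0.5.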

def update_elo_ratings(winner, loser):
    if winner not in elo_ratings or loser not in elo_ratings:
        initialize_elo_ratings()
    winner_rating = elo_ratings[winner]
    loser_rating = elo_ratings[loser]
    expected_winner = calculate_expected_score(winner_rating, loser_rating)
    expected_loser = 1 - expected_winner
    winner_size = get_model_size(winner)
    loser_size = get_model_size(loser)
    max_size = max(get_model_size(model) for model, _ in arena_config.APPROVED_MODELS)
    k_factor = min(64, 32 * (1 + (loser_size - winner_size) / max_size))
    elo_ratings[winner] += k_factor * (1 - expected_winner)
    elo_ratings[loser] += k_factor * (0 - expected_loser)
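
# Illustration of the size-weighted K-factor used above (hypothetical sizes,
# not taken from arena_config.APPROVED_MODELS): with max_size = 70, a 7B model
# beating a 70B model gets K = min(64, 32 * (1 + 63 / 70)) ≈ 60.8, equally
# sized models exchange the standard K = 32, and a 70B model beating a 7B one
# gets K = 32 * (1 - 63 / 70) ≈ 3.2, so upsets move ratings far more than
# expected wins do.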

def initialize_elo_ratings():
    leaderboard = load_leaderboard()
    for model, _ in arena_config.APPROVED_MODELS:
        size = get_model_size(model)
        elo_ratings[model] = 1000 + (size * 100)
    # Replay all battles to update ELO ratings
    for model, data in leaderboard.items():
        if model not in elo_ratings:
            elo_ratings[model] = 1000 + (get_model_size(model) * 100)
        for opponent, results in data['opponents'].items():
            if opponent not in elo_ratings:
                elo_ratings[opponent] = 1000 + (get_model_size(opponent) * 100)
            for _ in range(results['wins']):
                update_elo_ratings(model, opponent)
            for _ in range(results['losses']):
                update_elo_ratings(opponent, model)

def ensure_elo_ratings_initialized():
    if not elo_ratings:
        initialize_elo_ratings()

def update_leaderboard(winner: str, loser: str) -> Dict[str, Any]:
    leaderboard = load_leaderboard()
    if winner not in leaderboard:
        leaderboard[winner] = {"wins": 0, "losses": 0, "opponents": {}}
    if loser not in leaderboard:
        leaderboard[loser] = {"wins": 0, "losses": 0, "opponents": {}}
    leaderboard[winner]["wins"] += 1
    leaderboard[winner]["opponents"].setdefault(loser, {"wins": 0, "losses": 0})["wins"] += 1
    leaderboard[loser]["losses"] += 1
    leaderboard[loser]["opponents"].setdefault(winner, {"wins": 0, "losses": 0})["losses"] += 1
    # Update ELO ratings
    update_elo_ratings(winner, loser)
    save_leaderboard(leaderboard)
    return leaderboard
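
# Shape of a leaderboard entry after a few battles (illustrative values only;
# "model-a"/"model-b"/"model-c" are placeholder ids):
# {
#   "model-a": {
#     "wins": 3, "losses": 1,
#     "opponents": {"model-b": {"wins": 2, "losses": 1},
#                   "model-c": {"wins": 1, "losses": 0}}
#   },
#   ...
# }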

def get_current_leaderboard() -> Dict[str, Any]:
    return load_leaderboard()

def get_human_readable_name(model_name: str) -> str:
    model_dict = dict(arena_config.APPROVED_MODELS)
    return model_dict.get(model_name, model_name)

def get_leaderboard():
    leaderboard = load_leaderboard()
    # Calculate scores for each model
    for model, results in leaderboard.items():
        total_battles = results["wins"] + results["losses"]
        if total_battles > 0:
            win_rate = results["wins"] / total_battles
            results["score"] = win_rate * (1 - 1 / (total_battles + 1))
        else:
            results["score"] = 0
    # Sort results by score, then by total battles
    sorted_results = sorted(
        leaderboard.items(),
        key=lambda x: (x[1]["score"], x[1]["wins"] + x[1]["losses"]),
        reverse=True
    )
    # Explanation of the main leaderboard
    explanation = """
    <p style="font-size: 16px; margin-bottom: 20px;">
    This leaderboard uses a scoring system that balances win rate and total battles. The score is calculated using the formula:
    <br>
    <strong>Score = Win Rate * (1 - 1 / (Total Battles + 1))</strong>
    <br>
    This formula rewards models with higher win rates and more battles. As the number of battles increases, the score approaches the win rate.
    </p>
    """
    leaderboard_html = f"""
    {explanation}
    <style>
    .leaderboard-table {{
        width: 100%;
        border-collapse: collapse;
        font-family: Arial, sans-serif;
    }}
    .leaderboard-table th, .leaderboard-table td {{
        border: 1px solid #ddd;
        padding: 8px;
        text-align: left;
    }}
    .leaderboard-table th {{
        background-color: rgba(255, 255, 255, 0.1);
        font-weight: bold;
    }}
    .rank-column {{
        width: 60px;
        text-align: center;
    }}
    .opponent-details {{
        font-size: 0.9em;
        color: #888;
    }}
    </style>
    <table class='leaderboard-table'>
        <tr>
            <th class='rank-column'>Rank</th>
            <th>Model</th>
            <th>Score</th>
            <th>Wins</th>
            <th>Losses</th>
            <th>Win Rate</th>
            <th>Total Battles</th>
            <th>Top Rival</th>
            <th>Toughest Opponent</th>
        </tr>
    """
    for index, (model, results) in enumerate(sorted_results, start=1):
        total_battles = results["wins"] + results["losses"]
        win_rate = (results["wins"] / total_battles * 100) if total_battles > 0 else 0
        rank_display = {1: "🥇", 2: "🥈", 3: "🥉"}.get(index, f"{index}")
        top_rival = max(results["opponents"].items(), key=lambda x: x[1]["wins"], default=(None, {"wins": 0}))
        top_rival_name = get_human_readable_name(top_rival[0]) if top_rival[0] else "N/A"
        top_rival_wins = top_rival[1]["wins"]
        toughest_opponent = max(results["opponents"].items(), key=lambda x: x[1]["losses"], default=(None, {"losses": 0}))
        toughest_opponent_name = get_human_readable_name(toughest_opponent[0]) if toughest_opponent[0] else "N/A"
        toughest_opponent_losses = toughest_opponent[1]["losses"]
        leaderboard_html += f"""
        <tr>
            <td class='rank-column'>{rank_display}</td>
            <td>{get_human_readable_name(model)}</td>
            <td>{results['score']:.4f}</td>
            <td>{results['wins']}</td>
            <td>{results['losses']}</td>
            <td>{win_rate:.2f}%</td>
            <td>{total_battles}</td>
            <td class='opponent-details'>{top_rival_name} (W: {top_rival_wins})</td>
            <td class='opponent-details'>{toughest_opponent_name} (L: {toughest_opponent_losses})</td>
        </tr>
        """
    leaderboard_html += "</table>"
    return leaderboard_html

def calculate_elo_impact(model):
    positive_impact = 0
    negative_impact = 0
    leaderboard = load_leaderboard()
    initial_rating = 1000 + (get_model_size(model) * 100)
    if model in leaderboard:
        for opponent, results in leaderboard[model]['opponents'].items():
            model_size = get_model_size(model)
            opponent_size = get_model_size(opponent)
            max_size = max(get_model_size(m) for m, _ in arena_config.APPROVED_MODELS)
            size_difference = (opponent_size - model_size) / max_size
            win_impact = 1 + max(0, size_difference)
            loss_impact = 1 + max(0, -size_difference)
            positive_impact += results['wins'] * win_impact
            negative_impact += results['losses'] * loss_impact
    return round(positive_impact), round(negative_impact), round(initial_rating)
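
# Illustration of the impact weighting above (hypothetical sizes, not taken
# from arena_config.APPROVED_MODELS): with max_size = 70, each win of a 7B
# model over a 70B model counts as 1 + 63 / 70 = 1.9 positive impact, while
# each loss to that same 70B model counts only as the baseline 1.0 negative
# impact.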

def get_elo_leaderboard():
    ensure_elo_ratings_initialized()
    leaderboard = load_leaderboard()
    # Create a list of all models, including those from APPROVED_MODELS that might not be in the leaderboard yet
    all_models = set(dict(arena_config.APPROVED_MODELS).keys()) | set(leaderboard.keys())
    elo_data = []
    for model in all_models:
        initial_rating = 1000 + (get_model_size(model) * 100)
        current_rating = elo_ratings.get(model, initial_rating)
        # Calculate battle data only if the model exists in the leaderboard
        if model in leaderboard:
            wins = leaderboard[model].get('wins', 0)
            losses = leaderboard[model].get('losses', 0)
            total_battles = wins + losses
            positive_impact, negative_impact, _ = calculate_elo_impact(model)
        else:
            wins = losses = total_battles = positive_impact = negative_impact = 0
        elo_data.append({
            'model': model,
            'current_rating': current_rating,
            'initial_rating': initial_rating,
            'total_battles': total_battles,
            'positive_impact': positive_impact,
            'negative_impact': negative_impact
        })
    # Sort the data by current rating
    sorted_elo_data = sorted(elo_data, key=lambda x: x['current_rating'], reverse=True)
    min_initial_rating = min(data['initial_rating'] for data in elo_data)
    max_initial_rating = max(data['initial_rating'] for data in elo_data)
    explanation_elo = f"""
    <p style="font-size: 16px; margin-bottom: 20px;">
    This leaderboard uses a modified ELO rating system that takes into account both the performance and size of the models.
    Initial ratings range from {round(min_initial_rating)} to {round(max_initial_rating)} points, based on model size, with larger models starting at higher ratings.
    The "Positive Impact" score reflects the significance of wins, with higher scores for defeating larger models.
    The "Negative Impact" score indicates the significance of losses, with higher scores for losing against smaller models.
    The current ELO rating is calculated based on these impacts and the model's performance history.
    </p>
    """
    leaderboard_html = f"""
    {explanation_elo}
    <style>
    .elo-leaderboard-table {{
        width: 100%;
        border-collapse: collapse;
        font-family: Arial, sans-serif;
    }}
    .elo-leaderboard-table th, .elo-leaderboard-table td {{
        border: 1px solid #ddd;
        padding: 8px;
        text-align: left;
    }}
    .elo-leaderboard-table th {{
        background-color: rgba(255, 255, 255, 0.1);
        font-weight: bold;
    }}
    .rank-column {{
        width: 60px;
        text-align: center;
    }}
    </style>
    <table class='elo-leaderboard-table'>
        <tr>
            <th class='rank-column'>Rank</th>
            <th>Model</th>
            <th>Current ELO Rating</th>
            <th>Positive Impact</th>
            <th>Negative Impact</th>
            <th>Total Battles</th>
            <th>Initial Rating</th>
        </tr>
    """
    for index, data in enumerate(sorted_elo_data, start=1):
        rank_display = {1: "🥇", 2: "🥈", 3: "🥉"}.get(index, f"{index}")
        leaderboard_html += f"""
        <tr>
            <td class='rank-column'>{rank_display}</td>
            <td>{get_human_readable_name(data['model'])}</td>
            <td><strong>{round(data['current_rating'])}</strong></td>
            <td>{data['positive_impact']}</td>
            <td>{data['negative_impact']}</td>
            <td>{data['total_battles']}</td>
            <td>{round(data['initial_rating'])}</td>
        </tr>
        """
    leaderboard_html += "</table>"
    return leaderboard_html

def create_backup():
    while True:
        try:
            leaderboard_data = load_leaderboard()
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            backup_file_name = f"leaderboard_backup_{timestamp}.json"
            backup_path = f"{arena_config.NEXTCLOUD_BACKUP_FOLDER}/{backup_file_name}"
            json_data = json.dumps(leaderboard_data, indent=2)
            nc.files.upload(backup_path, json_data.encode('utf-8'))
            print(f"Backup created on Nextcloud: {backup_path}")
        except Exception as e:
            print(f"Error creating backup: {e}")
        time.sleep(3600)  # Sleep for 1 hour

def start_backup_thread():
    backup_thread = threading.Thread(target=create_backup, daemon=True)
    backup_thread.start()

def get_leaderboard_chart():
    battle_results = get_current_leaderboard()
    # Calculate scores and sort results
    for model, results in battle_results.items():
        total_battles = results["wins"] + results["losses"]
        if total_battles > 0:
            win_rate = results["wins"] / total_battles
            results["score"] = win_rate * (1 - 1 / (total_battles + 1))
        else:
            results["score"] = 0
    sorted_results = sorted(
        battle_results.items(),
        key=lambda x: (x[1]["score"], x[1]["wins"] + x[1]["losses"]),
        reverse=True
    )
    models = [get_human_readable_name(model) for model, _ in sorted_results]
    wins = [results["wins"] for _, results in sorted_results]
    losses = [results["losses"] for _, results in sorted_results]
    scores = [results["score"] for _, results in sorted_results]
    fig = go.Figure()
    # Stacked bar chart for wins and losses
    fig.add_trace(go.Bar(
        x=models,
        y=wins,
        name='Wins',
        marker_color='#22577a'
    ))
    fig.add_trace(go.Bar(
        x=models,
        y=losses,
        name='Losses',
        marker_color='#38a3a5'
    ))
    # Line chart for scores, plotted on a secondary y-axis
    fig.add_trace(go.Scatter(
        x=models,
        y=scores,
        name='Score',
        yaxis='y2',
        line=dict(color='#ff7f0e', width=2)
    ))
    # Update layout for full width, increased height, and a secondary y-axis
    fig.update_layout(
        title='Model Performance',
        xaxis_title='Models',
        yaxis_title='Number of Battles',
        yaxis2=dict(
            title='Score',
            overlaying='y',
            side='right'
        ),
        barmode='stack',
        height=800,
        width=1450,
        autosize=True,
        legend=dict(
            orientation='h',
            yanchor='bottom',
            y=1.02,
            xanchor='right',
            x=1
        )
    )
    return fig
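
# Minimal usage sketch (hypothetical wiring; the surrounding app, e.g. a Gradio
# UI, is not shown here, and "model-a"/"model-b" are placeholder model ids that
# should exist in arena_config.APPROVED_MODELS):
#
#     ensure_elo_ratings_initialized()
#     start_backup_thread()
#     update_leaderboard("model-a", "model-b")  # record that model-a beat model-b
#     main_html = get_leaderboard()
#     elo_html = get_elo_leaderboard()
#     chart = get_leaderboard_chart()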