Update leaderboard.py
leaderboard.py  CHANGED  (+112 -29)
@@ -8,6 +8,7 @@ import threading
 import arena_config
 import sys
 import math
+import plotly.graph_objects as go
 
 # Initialize Nextcloud client
 nc = Nextcloud(nextcloud_url=arena_config.NEXTCLOUD_URL, nc_auth_user=arena_config.NEXTCLOUD_USERNAME, nc_auth_pass=arena_config.NEXTCLOUD_PASSWORD)
@@ -56,7 +57,7 @@ def update_elo_ratings(winner, loser):
     loser_size = get_model_size(loser)
     max_size = max(get_model_size(model) for model, _ in arena_config.APPROVED_MODELS)
 
-    k_factor = 32 * (1 + (loser_size - winner_size) / max_size)
+    k_factor = min(64, 32 * (1 + (loser_size - winner_size) / max_size))
 
     elo_ratings[winner] += k_factor * (1 - expected_winner)
     elo_ratings[loser] += k_factor * (0 - expected_loser)
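Note on this hunk: the size-scaled K-factor is now clamped at 64. Because loser_size - winner_size is always less than max_size for positive sizes, the uncapped expression already stays below 64, so the min() acts as a guard rather than a value that binds in normal data. A worked example with illustrative sizes, not taken from arena_config:

# Illustrative only: a 7B winner upsetting a 70B loser, 70B being the largest model.
winner_size, loser_size, max_size = 7, 70, 70
k_factor = min(64, 32 * (1 + (loser_size - winner_size) / max_size))
print(k_factor)  # 60.8 -- the upset nearly doubles the base K of 32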
@@ -205,6 +206,27 @@ def get_leaderboard():
     leaderboard_html += "</table>"
     return leaderboard_html
 
+def calculate_elo_impact(model):
+    positive_impact = 0
+    negative_impact = 0
+    leaderboard = load_leaderboard()
+    initial_rating = 1000 + (get_model_size(model) * 100)
+
+    for opponent, results in leaderboard[model]['opponents'].items():
+        model_size = get_model_size(model)
+        opponent_size = get_model_size(opponent)
+        max_size = max(get_model_size(m) for m, _ in arena_config.APPROVED_MODELS)
+
+        size_difference = (opponent_size - model_size) / max_size
+
+        win_impact = 1 + max(0, size_difference)
+        loss_impact = 1 + max(0, -size_difference)
+
+        positive_impact += results['wins'] * win_impact
+        negative_impact += results['losses'] * loss_impact
+
+    return round(positive_impact), round(negative_impact), round(initial_rating)
+
 def get_elo_leaderboard():
     ensure_elo_ratings_initialized()
     leaderboard = load_leaderboard()
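Note on calculate_elo_impact: it replaces the per-battle point simulation removed further down with closed-form counts. Each win is weighted by how much larger the opponent is, each loss by how much smaller, and the initial rating shown in the table is 1000 plus 100 per size unit. A minimal standalone sketch of the weighting, assuming sizes are in billions of parameters (units are not stated in this diff):

def impact(wins, losses, model_size, opponent_size, max_size):
    # Same arithmetic as calculate_elo_impact, isolated for illustration.
    size_difference = (opponent_size - model_size) / max_size
    win_impact = 1 + max(0, size_difference)    # beating a bigger model counts more
    loss_impact = 1 + max(0, -size_difference)  # losing to a smaller model counts more
    return wins * win_impact, losses * loss_impact

# A 7B model going 3-2 against a 70B opponent (illustrative numbers):
print(impact(3, 2, 7, 70, 70))  # (5.7, 2.0) -- wins weigh 1.9x, losses stay at 1x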
@@ -217,9 +239,9 @@ def get_elo_leaderboard():
     <p style="font-size: 16px; margin-bottom: 20px;">
     This leaderboard uses a modified ELO rating system that takes into account both the performance and size of the models.
     Initial ratings range from {round(min_initial_rating)} to {round(max_initial_rating)} points, based on model size, with larger models starting at higher ratings.
-
-    The "
-    The
+    The "Positive Impact" score reflects the significance of wins, with higher scores for defeating larger models.
+    The "Negative Impact" score indicates the significance of losses, with higher scores for losing against smaller models.
+    The current ELO rating is calculated based on these impacts and the model's performance history.
     </p>
     """
 
@@ -249,41 +271,29 @@ def get_elo_leaderboard():
     <tr>
         <th class='rank-column'>Rank</th>
         <th>Model</th>
-        <th>ELO Rating</th>
-        <th>Points Scored</th>
-        <th>Points Lost</th>
+        <th>Current ELO Rating</th>
+        <th>Positive Impact</th>
+        <th>Negative Impact</th>
+        <th>Total Battles</th>
+        <th>Initial Rating</th>
+
     </tr>
     """
 
     for index, (model, rating) in enumerate(sorted_ratings, start=1):
+        total_battles = leaderboard[model]['wins'] + leaderboard[model]['losses']
         rank_display = {1: "🥇", 2: "🥈", 3: "🥉"}.get(index, f"{index}")
-
-
-        points_scored = 0
-        points_lost = 0
-        if model in leaderboard:
-            for opponent, results in leaderboard[model]['opponents'].items():
-                opponent_rating = elo_ratings.get(opponent, 1000)
-                opponent_size = get_model_size(opponent)
-                max_size = max(get_model_size(m) for m, _ in arena_config.APPROVED_MODELS)
-
-                for _ in range(results['wins']):
-                    expected_score = calculate_expected_score(rating, opponent_rating)
-                    k_factor = 32 * (1 + (opponent_size - model_size) / max_size)
-                    points_scored += k_factor * (1 - expected_score)
-
-                for _ in range(results['losses']):
-                    expected_score = calculate_expected_score(rating, opponent_rating)
-                    k_factor = 32 * (1 + (model_size - opponent_size) / max_size)
-                    points_lost += k_factor * expected_score
+        positive_impact, negative_impact, initial_rating = calculate_elo_impact(model)
 
         leaderboard_html += f"""
         <tr>
             <td class='rank-column'>{rank_display}</td>
             <td>{get_human_readable_name(model)}</td>
-            <td>{round(rating)}</td>
-            <td>{round(points_scored)}</td>
-            <td>{round(points_lost)}</td>
+            <td><strong>{round(rating)}</strong></td>
+            <td>{positive_impact}</td>
+            <td>{negative_impact}</td>
+            <td>{total_battles}</td>
+            <td>{initial_rating}</td>
         </tr>
         """
 
@@ -307,3 +317,76 @@ def create_backup():
 def start_backup_thread():
     backup_thread = threading.Thread(target=create_backup, daemon=True)
     backup_thread.start()
+
+def get_leaderboard_chart():
+    battle_results = get_current_leaderboard()
+
+    # Calculate scores and sort results
+    for model, results in battle_results.items():
+        total_battles = results["wins"] + results["losses"]
+        if total_battles > 0:
+            win_rate = results["wins"] / total_battles
+            results["score"] = win_rate * (1 - 1 / (total_battles + 1))
+        else:
+            results["score"] = 0
+
+    sorted_results = sorted(
+        battle_results.items(),
+        key=lambda x: (x[1]["score"], x[1]["wins"] + x[1]["losses"]),
+        reverse=True
+    )
+
+    models = [get_human_readable_name(model) for model, _ in sorted_results]
+    wins = [results["wins"] for _, results in sorted_results]
+    losses = [results["losses"] for _, results in sorted_results]
+    scores = [results["score"] for _, results in sorted_results]
+
+    fig = go.Figure()
+
+    # Stacked Bar chart for Wins and Losses
+    fig.add_trace(go.Bar(
+        x=models,
+        y=wins,
+        name='Wins',
+        marker_color='#22577a'
+    ))
+    fig.add_trace(go.Bar(
+        x=models,
+        y=losses,
+        name='Losses',
+        marker_color='#38a3a5'
+    ))
+
+    # Line chart for Scores
+    fig.add_trace(go.Scatter(
+        x=models,
+        y=scores,
+        name='Score',
+        yaxis='y2',
+        line=dict(color='#ff7f0e', width=2)
+    ))
+
+    # Update layout for full-width, increased height, and secondary y-axis
+    fig.update_layout(
+        title='Model Performance',
+        xaxis_title='Models',
+        yaxis_title='Number of Battles',
+        yaxis2=dict(
+            title='Score',
+            overlaying='y',
+            side='right'
+        ),
+        barmode='stack',
+        height=800,
+        width=1450,
+        autosize=True,
+        legend=dict(
+            orientation='h',
+            yanchor='bottom',
+            y=1.02,
+            xanchor='right',
+            x=1
+        )
+    )
+
+    return fig
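Note on the sorting score in get_leaderboard_chart: win_rate * (1 - 1/(n+1)) shrinks the win rate toward zero for models with few battles, so a thin record cannot dominate the chart ordering. Illustrative arithmetic, with records not taken from the repo:

# score = win_rate * (1 - 1 / (total_battles + 1))
# 2-0 record:  1.000 * (1 - 1/3)  = 0.667
# 10-2 record: 0.833 * (1 - 1/13) = 0.769  -> outranks the perfect but short 2-0 record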
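Since get_leaderboard_chart returns a plotly Figure rather than HTML, the caller decides how to render it. A hypothetical wiring sketch, assuming this Space uses Gradio for its UI (the UI code is not part of this diff):

import gradio as gr

with gr.Blocks() as demo:
    gr.HTML(get_elo_leaderboard())    # ELO table as raw HTML
    gr.Plot(get_leaderboard_chart())  # gr.Plot renders a plotly Figure directly

demo.launch()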