k-mktr committed on
Commit d6f8bd2 • 1 Parent(s): 264e6cc

Update leaderboard.py

Files changed (1):
  1. leaderboard.py +112 -29
leaderboard.py CHANGED
@@ -8,6 +8,7 @@ import threading
 import arena_config
 import sys
 import math
+import plotly.graph_objects as go
 
 # Initialize Nextcloud client
 nc = Nextcloud(nextcloud_url=arena_config.NEXTCLOUD_URL, nc_auth_user=arena_config.NEXTCLOUD_USERNAME, nc_auth_pass=arena_config.NEXTCLOUD_PASSWORD)
@@ -56,7 +57,7 @@ def update_elo_ratings(winner, loser):
     loser_size = get_model_size(loser)
     max_size = max(get_model_size(model) for model, _ in arena_config.APPROVED_MODELS)
 
-    k_factor = 32 * (1 + (loser_size - winner_size) / max_size)
+    k_factor = min(64, 32 * (1 + (loser_size - winner_size) / max_size))
 
     elo_ratings[winner] += k_factor * (1 - expected_winner)
     elo_ratings[loser] += k_factor * (0 - expected_loser)
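A note on the new k_factor line: because model sizes lie between 0 and max_size, the raw bonus 32 * (1 + (loser_size - winner_size) / max_size) already stays under 64, so the min(64, ...) cap acts as a safety clamp on the update step rather than changing typical battles. A minimal sketch of the resulting updates, assuming calculate_expected_score is the standard Elo expectation (its definition is not shown in this diff) and using made-up sizes:

def expected_score(rating_a, rating_b):
    # Standard Elo expectation; assumed to match calculate_expected_score
    return 1 / (1 + 10 ** ((rating_b - rating_a) / 400))

max_size = 70  # hypothetical size of the largest approved model

for winner_size, loser_size in [(7, 70), (70, 7), (7, 7)]:
    k_factor = min(64, 32 * (1 + (loser_size - winner_size) / max_size))
    gain = k_factor * (1 - expected_score(1000, 1000))  # equal ratings
    print(f"{winner_size}B beats {loser_size}B: k={k_factor:.1f}, winner +{gain:.1f}")
# 7B beats 70B: k=60.8, winner +30.4  (upset, big reward)
# 70B beats 7B: k=3.2, winner +1.6    (expected win, small reward)
# 7B beats 7B: k=32.0, winner +16.0   (plain Elo step)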
@@ -205,6 +206,27 @@ def get_leaderboard():
     leaderboard_html += "</table>"
     return leaderboard_html
 
+def calculate_elo_impact(model):
+    positive_impact = 0
+    negative_impact = 0
+    leaderboard = load_leaderboard()
+    initial_rating = 1000 + (get_model_size(model) * 100)
+
+    for opponent, results in leaderboard[model]['opponents'].items():
+        model_size = get_model_size(model)
+        opponent_size = get_model_size(opponent)
+        max_size = max(get_model_size(m) for m, _ in arena_config.APPROVED_MODELS)
+
+        size_difference = (opponent_size - model_size) / max_size
+
+        win_impact = 1 + max(0, size_difference)
+        loss_impact = 1 + max(0, -size_difference)
+
+        positive_impact += results['wins'] * win_impact
+        negative_impact += results['losses'] * loss_impact
+
+    return round(positive_impact), round(negative_impact), round(initial_rating)
+
 def get_elo_leaderboard():
     ensure_elo_ratings_initialized()
     leaderboard = load_leaderboard()
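The added calculate_elo_impact derives its scores from the persisted head-to-head record instead of replaying rating updates. A toy example of the data shape the function appears to expect; field names are taken from the code, while the model names, sizes, and records are invented:

# Hypothetical stand-ins for load_leaderboard() and get_model_size()
leaderboard = {
    "tiny-model": {
        "wins": 12, "losses": 8,
        "opponents": {"big-model": {"wins": 12, "losses": 8}},
    },
}
sizes = {"tiny-model": 3, "big-model": 70}
max_size = 70

initial_rating = 1000 + sizes["tiny-model"] * 100  # 1300

results = leaderboard["tiny-model"]["opponents"]["big-model"]
size_difference = (sizes["big-model"] - sizes["tiny-model"]) / max_size  # ~0.957

win_impact = 1 + max(0, size_difference)    # upset wins count ~1.96x
loss_impact = 1 + max(0, -size_difference)  # expected losses count 1.0x

positive_impact = results["wins"] * win_impact     # ~23.49
negative_impact = results["losses"] * loss_impact  # 8.0
print(round(positive_impact), round(negative_impact), initial_rating)  # 23 8 1300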
@@ -217,9 +239,9 @@ def get_elo_leaderboard():
     <p style="font-size: 16px; margin-bottom: 20px;">
         This leaderboard uses a modified ELO rating system that takes into account both the performance and size of the models.
         Initial ratings range from {round(min_initial_rating)} to {round(max_initial_rating)} points, based on model size, with larger models starting at higher ratings.
-        When a smaller model defeats a larger one, it gains more points, while larger models gain fewer points for beating smaller ones.
-        The "Points Scored" column shows the total ELO points gained by the model from its victories, reflecting both quantity and quality of wins.
-        The "Points Lost" column shows the total ELO points lost by the model from its defeats, indicating the challenges faced.
+        The "Positive Impact" score reflects the significance of wins, with higher scores for defeating larger models.
+        The "Negative Impact" score indicates the significance of losses, with higher scores for losing against smaller models.
+        The current ELO rating is calculated based on these impacts and the model's performance history.
     </p>
     """
 
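For concreteness, assuming min_initial_rating and max_initial_rating follow the same rule as calculate_elo_impact above (1000 points plus 100 per size unit), a hypothetical roster spanning sizes 0.5 to 70 would open this paragraph with a range of 1050 to 8000:

# Hypothetical sizes; the real values come from arena_config.APPROVED_MODELS
sizes = [0.5, 3, 8, 70]
initial_ratings = [1000 + s * 100 for s in sizes]
print(round(min(initial_ratings)), round(max(initial_ratings)))  # 1050 8000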
@@ -249,41 +271,29 @@ def get_elo_leaderboard():
         <tr>
             <th class='rank-column'>Rank</th>
             <th>Model</th>
-            <th>ELO Rating</th>
-            <th>Points Scored</th>
-            <th>Points Lost</th>
+            <th>Current ELO Rating</th>
+            <th>Positive Impact</th>
+            <th>Negative Impact</th>
+            <th>Total Battles</th>
+            <th>Initial Rating</th>
+
         </tr>
     """
 
     for index, (model, rating) in enumerate(sorted_ratings, start=1):
+        total_battles = leaderboard[model]['wins'] + leaderboard[model]['losses']
         rank_display = {1: "🥇", 2: "🥈", 3: "🥉"}.get(index, f"{index}")
-        model_size = get_model_size(model)
-
-        points_scored = 0
-        points_lost = 0
-        if model in leaderboard:
-            for opponent, results in leaderboard[model]['opponents'].items():
-                opponent_rating = elo_ratings.get(opponent, 1000)
-                opponent_size = get_model_size(opponent)
-                max_size = max(get_model_size(m) for m, _ in arena_config.APPROVED_MODELS)
-
-                for _ in range(results['wins']):
-                    expected_score = calculate_expected_score(rating, opponent_rating)
-                    k_factor = 32 * (1 + (opponent_size - model_size) / max_size)
-                    points_scored += k_factor * (1 - expected_score)
-
-                for _ in range(results['losses']):
-                    expected_score = calculate_expected_score(rating, opponent_rating)
-                    k_factor = 32 * (1 + (model_size - opponent_size) / max_size)
-                    points_lost += k_factor * expected_score
+        positive_impact, negative_impact, initial_rating = calculate_elo_impact(model)
 
         leaderboard_html += f"""
         <tr>
             <td class='rank-column'>{rank_display}</td>
             <td>{get_human_readable_name(model)}</td>
-            <td>{round(rating)}</td>
-            <td>{round(points_scored, 2)}</td>
-            <td>{round(points_lost, 2)}</td>
+            <td><strong>{round(rating)}</strong></td>
+            <td>{positive_impact}</td>
+            <td>{negative_impact}</td>
+            <td>{total_battles}</td>
+            <td>{initial_rating}</td>
         </tr>
         """
 
@@ -307,3 +317,76 @@ def create_backup():
 def start_backup_thread():
     backup_thread = threading.Thread(target=create_backup, daemon=True)
     backup_thread.start()
+
+def get_leaderboard_chart():
+    battle_results = get_current_leaderboard()
+
+    # Calculate scores and sort results
+    for model, results in battle_results.items():
+        total_battles = results["wins"] + results["losses"]
+        if total_battles > 0:
+            win_rate = results["wins"] / total_battles
+            results["score"] = win_rate * (1 - 1 / (total_battles + 1))
+        else:
+            results["score"] = 0
+
+    sorted_results = sorted(
+        battle_results.items(),
+        key=lambda x: (x[1]["score"], x[1]["wins"] + x[1]["losses"]),
+        reverse=True
+    )
+
+    models = [get_human_readable_name(model) for model, _ in sorted_results]
+    wins = [results["wins"] for _, results in sorted_results]
+    losses = [results["losses"] for _, results in sorted_results]
+    scores = [results["score"] for _, results in sorted_results]
+
+    fig = go.Figure()
+
+    # Stacked Bar chart for Wins and Losses
+    fig.add_trace(go.Bar(
+        x=models,
+        y=wins,
+        name='Wins',
+        marker_color='#22577a'
+    ))
+    fig.add_trace(go.Bar(
+        x=models,
+        y=losses,
+        name='Losses',
+        marker_color='#38a3a5'
+    ))
+
+    # Line chart for Scores
+    fig.add_trace(go.Scatter(
+        x=models,
+        y=scores,
+        name='Score',
+        yaxis='y2',
+        line=dict(color='#ff7f0e', width=2)
+    ))
+
+    # Update layout for full-width, increased height, and secondary y-axis
+    fig.update_layout(
+        title='Model Performance',
+        xaxis_title='Models',
+        yaxis_title='Number of Battles',
+        yaxis2=dict(
+            title='Score',
+            overlaying='y',
+            side='right'
+        ),
+        barmode='stack',
+        height=800,
+        width=1450,
+        autosize=True,
+        legend=dict(
+            orientation='h',
+            yanchor='bottom',
+            y=1.02,
+            xanchor='right',
+            x=1
+        )
+    )
+
+    return fig
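The chart's score discounts raw win rate by sample size: win_rate * (1 - 1 / (total_battles + 1)), so a perfect one-battle record ranks below a long, mostly winning one. A quick check of that formula, followed by one way to render the figure (both calls are standard Plotly; how the app itself displays the figure is not shown in this diff):

# Behavior of win_rate * (1 - 1 / (total_battles + 1))
for wins, losses in [(1, 0), (9, 1), (90, 10)]:
    total = wins + losses
    score = (wins / total) * (1 - 1 / (total + 1))
    print(f"{wins}-{losses}: {score:.3f}")
# 1-0: 0.500    (perfect record, single battle)
# 9-1: 0.818
# 90-10: 0.891  (same win rate as 9-1, more evidence)

fig = get_leaderboard_chart()             # uses the module above
fig.write_html("leaderboard_chart.html")  # save a standalone page
# fig.show()                              # or open directly in a browser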