rusticluftig committed
Commit 92ec2a2 · 1 Parent(s): 838067a

Update LB for INSTRUCT_8B comp

Files changed (3):
  1. app.py +47 -56
  2. competitions.py +10 -0
  3. utils.py +21 -7
app.py CHANGED
@@ -4,10 +4,10 @@ import datetime
 import os
 
 import gradio as gr
+import matplotlib.pyplot as plt
 from apscheduler.schedulers.background import BackgroundScheduler
 from dotenv import load_dotenv
 from huggingface_hub import HfApi
-import matplotlib.pyplot as plt
 
 import competitions
 import utils
@@ -57,7 +57,6 @@ def main():
     validator_df = state_vars["validator_df"]
     benchmarks_df = state_vars["benchmarks_df"]
     benchmarks_targets = state_vars["benchmarks_targets"]
-    losses_2 = state_vars["losses_2"]
 
     demo = gr.Blocks(css=".typewriter {font-family: 'JMH Typewriter', sans-serif;}")
     with demo:
@@ -75,58 +74,41 @@ def main():
                 num_top_classes=10,
             )
 
+        comp_ids = [2, 3]
         with gr.Accordion("Competition Results"):
             gr.HTML(EVALUATION_HEADER)
             show_stale = gr.Checkbox(label="Show Stale", interactive=True)
             competition_leaderboards = []
-            comp_2 = competitions.COMPETITION_DETAILS[2]
-            with gr.Accordion(f"{comp_2.name} Competition"):
-                gr.HTML(comp_2.html_description)
-                competition_leaderboards.append(
-                    gr.components.Dataframe(
-                        value=utils.leaderboard_data(
-                            model_data, scores, 2, show_stale.value
-                        ),
-                        headers=[
-                            "Name",
-                            "Win Rate",
-                            "Score",
-                            "Weight",
-                            "UID",
-                            "Block",
-                        ],
-                        datatype=[
-                            "markdown",
-                            "number",
-                            "number",
-                            "number",
-                            "number",
-                            "number",
-                        ],
-                        elem_id="comp2-table",
-                        interactive=False,
-                        visible=True,
+            for comp_id in comp_ids:
+                details = competitions.COMPETITION_DETAILS[comp_id]
+                with gr.Accordion(f"{details.name} Competition"):
+                    gr.HTML(details.html_description)
+                    competition_leaderboards.append(
+                        gr.components.Dataframe(
+                            value=utils.leaderboard_data(
+                                model_data, scores, comp_id, show_stale.value
+                            ),
+                            headers=[
+                                "Name",
+                                "Win Rate",
+                                "Score",
+                                "Weight",
+                                "UID",
+                                "Block",
+                            ],
+                            datatype=[
+                                "markdown",
+                                "number",
+                                "number",
+                                "number",
+                                "number",
+                                "number",
+                            ],
+                            elem_id=f"comp{comp_id}-table",
+                            interactive=False,
+                            visible=True,
+                        )
                     )
-                )
-                gr.LinePlot(
-                    losses_2,
-                    x="timestamp",
-                    x_title="Date",
-                    y="losses",
-                    y_title="Score",
-                    interactive=True,
-                    visible=True,
-                    width=1024,
-                    title="Best Score Over Time",
-                )
-                gr.HTML(
-                    """
-                    The definition of score changes over time as new evaluation tasks are added in releases.
-                    <ul>
-                    <li><b>Start-Oct 27</b>: % of wrong answers on synthetic MMLU</li>
-                    <li><b>Oct 27-Now</b>: + word sorting eval</li>
-                    """
-                )
             gr.HTML(
                 """
                 <ul><li><b>Name:</b> the 🤗 Hugging Face repo (click to go to the model card)</li>
@@ -137,17 +119,23 @@ def main():
                 <li><b>Block:</b> the Bittensor block that the model was submitted in</li></ul><br/>More stats on <a href="https://taostats.io/subnets/netuid-37/" target="_blank">taostats</a>."""
            )
             show_stale.change(
-                lambda stale: [utils.leaderboard_data(model_data, scores, 2, stale)],
+                lambda stale: [
+                    utils.leaderboard_data(model_data, scores, id, stale)
+                    for id in comp_ids
+                ],
                 inputs=[show_stale],
                 outputs=competition_leaderboards,
             )
 
         if benchmarks_df is not None:
 
-            def create_benchmark_plot(benchmark: str):
+            def create_benchmark_plot(benchmark: str, comp_id: int):
                 fig = plt.figure(figsize=(10, 8))
 
-                plt.plot(benchmarks_df["timestamp"], benchmarks_df[benchmark])
+                # Filter to just entries for this competition.
+                df = benchmarks_df[benchmarks_df["competition_id"] == comp_id]
+
+                plt.plot(df["timestamp"], df[benchmark])
 
                 # Adding horizontal dotted lines for various benchmark targets (well-known models)
                 for model, score in benchmarks_targets[benchmark].items():
@@ -169,10 +157,13 @@ def main():
                 return fig
 
             with gr.Accordion("Top Model Benchmarks"):
-                mmlu = create_benchmark_plot("mmlu")
-                mmlu_pro = create_benchmark_plot("mmlu_pro")
-                gr.Plot(mmlu)
-                gr.Plot(mmlu_pro)
+                for comp_id in comp_ids:
+                    details = competitions.COMPETITION_DETAILS[comp_id]
+                    with gr.Accordion(f"{details.name} Benchmarks"):
+                        mmlu = create_benchmark_plot("mmlu", comp_id)
+                        mmlu_pro = create_benchmark_plot("mmlu_pro", comp_id)
+                        gr.Plot(mmlu)
+                        gr.Plot(mmlu_pro)
                 gr.HTML(
                     """<div>Benchmarks computed using <a href='https://github.com/EleutherAI/lm-evaluation-harness'>lm-eval harness</a></div>"""
                 )
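
For context, here is a minimal, self-contained sketch of the UI pattern the updated app.py follows: one leaderboard Dataframe per competition ID, all refreshed from a single "Show Stale" checkbox whose callback returns one table per output component. The `fake_leaderboard` helper and its placeholder rows are hypothetical stand-ins for `utils.leaderboard_data`; this is not the app itself.

```python
import gradio as gr

COMP_IDS = [2, 3]


def fake_leaderboard(comp_id: int, show_stale: bool):
    # Placeholder rows; the real app derives these from chain and wandb state.
    rows = [[f"model-{comp_id}-a", 0.6], [f"model-{comp_id}-b", 0.4]]
    return rows if show_stale else rows[:1]


with gr.Blocks() as demo:
    show_stale = gr.Checkbox(label="Show Stale", interactive=True)
    leaderboards = []
    for comp_id in COMP_IDS:
        with gr.Accordion(f"Competition {comp_id}"):
            leaderboards.append(
                gr.Dataframe(
                    value=fake_leaderboard(comp_id, False),
                    headers=["Name", "Win Rate"],
                    interactive=False,
                )
            )

    # The callback returns a list with one table per competition, matching `outputs`.
    show_stale.change(
        lambda stale: [fake_leaderboard(cid, stale) for cid in COMP_IDS],
        inputs=[show_stale],
        outputs=leaderboards,
    )

if __name__ == "__main__":
    demo.launch()
```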
competitions.py CHANGED
@@ -1,4 +1,5 @@
 from dataclasses import dataclass
+import html
 from typing import Dict
 
 
@@ -21,5 +22,14 @@ COMPETITION_DETAILS: Dict[int, CompetitionDetails] = {
         name="General Knowledge Chat-bot",
         # TODO: Add link to SN1 dataset details.
         html_description="""<b>Competition ID 2</b><br/>Produce the best general knowledge chat-bot. Models are evaluated using synthetic MMLU-like dataset from Subnet 1.""",
+    ),
+    3: CompetitionDetails(
+        name="General Knowledge Chat-bot (BYO tokenizer)",
+        html_description="""<b>Competition ID 3</b><br/>Produce the best general knowledge chat-bot. Models bring their own tokenizer and are evaluated using synthetic MMLU-like dataset from Subnet 1.""",
     )
 }
+
+COMP_NAME_TO_ID = {
+    "B7_MULTI_CHOICE": 2,
+    "INSTRUCT_8B": 3,
+}
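
A rough usage sketch of the new constants, assuming `competitions.py` is importable as shown above; `run_sample` is a hypothetical wandb history row, not real data.

```python
from competitions import COMPETITION_DETAILS, COMP_NAME_TO_ID

# A benchmark run's logged sample; older runs may not log a competition name.
run_sample = {"competition_id": "INSTRUCT_8B"}

# Runs without a competition name default to B7_MULTI_CHOICE (ID 2).
comp_name = run_sample.get("competition_id", "B7_MULTI_CHOICE")
comp_id = COMP_NAME_TO_ID.get(comp_name, 2)

details = COMPETITION_DETAILS[comp_id]
print(comp_id, details.name)  # 3 General Knowledge Chat-bot (BYO tokenizer)
```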
utils.py CHANGED
@@ -15,7 +15,9 @@ import pandas as pd
 import wandb
 from bittensor.extrinsics.serving import get_metadata
 from dotenv import load_dotenv
-from wandb.apis.public.history import SampledHistoryScan
+from wandb.apis.public.history import HistoryScan, SampledHistoryScan
+
+from competitions import COMP_NAME_TO_ID
 
 NETUID = 37
 DELAY_SECS = 3
@@ -331,17 +333,16 @@ def get_benchmarks() -> Tuple[pd.DataFrame, Dict[str, Dict[str, float]]]:
     runs = get_wandb_runs(
         project=BENCHMARK_WANDB_PROJECT, filters=None, order="+created_at"
     )
-    timestamps, uids, models, mmlu, mmlu_pro = [], [], [], [], []
+    timestamps, uids, models, comp_ids, mmlu, mmlu_pro = [], [], [], [], [], []
     for run in runs:
         uid = run.config.get("uid", None)
         model = run.config.get("model", None)
         if not uid or not model:
             continue
         samples = list(
-            SampledHistoryScan(
+            HistoryScan(
                 run.client,
                 run,
-                ["_timestamp", "mmlu.acc,none", "mmlu_pro"],
                 0,
                 1,
             )
@@ -349,6 +350,19 @@ def get_benchmarks() -> Tuple[pd.DataFrame, Dict[str, Dict[str, float]]]:
         if not samples:
             continue
         sample = samples[0]
+
+        # Make sure we have all the required keys.
+        has_all_keys = True
+        for required_key in ["mmlu.acc,none", "mmlu_pro", "_timestamp"]:
+            if required_key not in sample:
+                has_all_keys = False
+                break
+        if not has_all_keys:
+            continue
+
+        # Any run without a competition ID was for competition 2.
+        comp_name = sample.get("competition_id", "B7_MULTI_CHOICE")
+        comp_ids.append(COMP_NAME_TO_ID.get(comp_name, 2))
         timestamps.append(datetime.datetime.fromtimestamp(sample["_timestamp"]))
         mmlu.append(sample["mmlu.acc,none"])
         mmlu_pro.append(sample["mmlu_pro"])
@@ -360,6 +374,7 @@ def get_benchmarks() -> Tuple[pd.DataFrame, Dict[str, Dict[str, float]]]:
             "timestamp": timestamps,
             "uid": uids,
             "model": models,
+            "competition_id": comp_ids,
             "mmlu": mmlu,
             "mmlu_pro": mmlu_pro,
         }
@@ -463,8 +478,8 @@ def load_state_vars() -> dict[Any]:
     print("Loaded validator weights")
 
     # Compute loss over time for all competitions.
-    losses_2 = get_losses_over_time(vali_runs, 2)
-    print("Loaded losses over time for comp 2")
+    # losses_2 = get_losses_over_time(vali_runs, 2)
+    # print("Loaded losses over time for comp 2")
 
     benchmarks_df, benchmarks_targets = get_benchmarks()
     print("Loaded benchmarks")
@@ -486,5 +501,4 @@ def load_state_vars() -> dict[Any]:
         "validator_df": validator_df,
         "benchmarks_df": benchmarks_df,
         "benchmarks_targets": benchmarks_targets,
-        "losses_2": losses_2,
     }
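
For illustration, a hedged sketch of how the `competition_id` column that `get_benchmarks()` now emits is consumed downstream (mirroring `create_benchmark_plot` in app.py). The DataFrame values here are synthetic stand-ins, not real benchmark results.

```python
import datetime

import matplotlib.pyplot as plt
import pandas as pd

# Synthetic stand-in for the DataFrame returned by get_benchmarks().
benchmarks_df = pd.DataFrame(
    {
        "timestamp": [datetime.datetime(2024, 10, d) for d in (1, 8, 15, 22)],
        "competition_id": [2, 2, 3, 3],
        "mmlu": [0.55, 0.58, 0.61, 0.63],
    }
)

comp_id = 3
# Filter to just the rows for this competition before plotting.
df = benchmarks_df[benchmarks_df["competition_id"] == comp_id]

fig = plt.figure(figsize=(10, 8))
plt.plot(df["timestamp"], df["mmlu"])
plt.title(f"mmlu over time (competition {comp_id})")
plt.xlabel("Date")
plt.ylabel("Score")
plt.savefig("benchmark_comp3.png")
```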