rusticluftig committed
Commit 92ec2a2 · 1 parent: 838067a
Update LB for INSTRUCT_8B comp

Files changed:
- app.py: +47 -56
- competitions.py: +10 -0
- utils.py: +21 -7
app.py
CHANGED
```diff
@@ -4,10 +4,10 @@ import datetime
 import os
 
 import gradio as gr
+import matplotlib.pyplot as plt
 from apscheduler.schedulers.background import BackgroundScheduler
 from dotenv import load_dotenv
 from huggingface_hub import HfApi
-import matplotlib.pyplot as plt
 
 import competitions
 import utils
@@ -57,7 +57,6 @@ def main():
     validator_df = state_vars["validator_df"]
     benchmarks_df = state_vars["benchmarks_df"]
     benchmarks_targets = state_vars["benchmarks_targets"]
-    losses_2 = state_vars["losses_2"]
 
     demo = gr.Blocks(css=".typewriter {font-family: 'JMH Typewriter', sans-serif;}")
     with demo:
@@ -75,58 +74,41 @@ def main():
             num_top_classes=10,
         )
 
+        comp_ids = [2, 3]
         with gr.Accordion("Competition Results"):
            gr.HTML(EVALUATION_HEADER)
            show_stale = gr.Checkbox(label="Show Stale", interactive=True)
            competition_leaderboards = []
-            … (removed lines not recoverable from the page)
+            for comp_id in comp_ids:
+                details = competitions.COMPETITION_DETAILS[comp_id]
+                with gr.Accordion(f"{details.name} Competition"):
+                    gr.HTML(details.html_description)
+                    competition_leaderboards.append(
+                        gr.components.Dataframe(
+                            value=utils.leaderboard_data(
+                                model_data, scores, comp_id, show_stale.value
+                            ),
+                            headers=[
+                                "Name",
+                                "Win Rate",
+                                "Score",
+                                "Weight",
+                                "UID",
+                                "Block",
+                            ],
+                            datatype=[
+                                "markdown",
+                                "number",
+                                "number",
+                                "number",
+                                "number",
+                                "number",
+                            ],
+                            elem_id=f"comp{comp_id}-table",
+                            interactive=False,
+                            visible=True,
+                        )
                    )
-            )
-            gr.LinePlot(
-                losses_2,
-                x="timestamp",
-                x_title="Date",
-                y="losses",
-                y_title="Score",
-                interactive=True,
-                visible=True,
-                width=1024,
-                title="Best Score Over Time",
-            )
-            gr.HTML(
-                """
-                The definition of score changes over time as new evaluation tasks are added in releases.
-                <ul>
-                <li><b>Start-Oct 27</b>: % of wrong answers on synthetic MMLU</li>
-                <li><b>Oct 27-Now</b>: + word sorting eval</li>
-                """
-            )
            gr.HTML(
                """
                <ul><li><b>Name:</b> the 🤗 Hugging Face repo (click to go to the model card)</li>
@@ -137,17 +119,23 @@ def main():
                <li><b>Block:</b> the Bittensor block that the model was submitted in</li></ul><br/>More stats on <a href="https://taostats.io/subnets/netuid-37/" target="_blank">taostats</a>."""
            )
            show_stale.change(
-                lambda stale: [ … (remainder of removed line not recoverable)
+                lambda stale: [
+                    utils.leaderboard_data(model_data, scores, id, stale)
+                    for id in comp_ids
+                ],
                inputs=[show_stale],
                outputs=competition_leaderboards,
            )
 
        if benchmarks_df is not None:
 
-            def create_benchmark_plot(benchmark: str):
+            def create_benchmark_plot(benchmark: str, comp_id: int):
                fig = plt.figure(figsize=(10, 8))
 
-                … (removed line not recoverable)
+                # Filter to just entries for this competition.
+                df = benchmarks_df[benchmarks_df["competition_id"] == comp_id]
+
+                plt.plot(df["timestamp"], df[benchmark])
 
                # Adding horizontal dotted lines for various benchmark targets (well-known models)
                for model, score in benchmarks_targets[benchmark].items():
@@ -169,10 +157,13 @@ def main():
                return fig
 
            with gr.Accordion("Top Model Benchmarks"):
-                … (removed lines not recoverable)
+                for comp_id in comp_ids:
+                    details = competitions.COMPETITION_DETAILS[comp_id]
+                    with gr.Accordion(f"{details.name} Benchmarks"):
+                        mmlu = create_benchmark_plot("mmlu", comp_id)
+                        mmlu_pro = create_benchmark_plot("mmlu_pro", comp_id)
+                        gr.Plot(mmlu)
+                        gr.Plot(mmlu_pro)
                gr.HTML(
                    """<div>Benchmarks computed using <a href='https://github.com/EleutherAI/lm-evaluation-harness'>lm-eval harness</a></div>"""
                )
```
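For context on the `show_stale.change` wiring above: Gradio maps the values returned by the callback positionally onto the components listed in `outputs`, so the new lambda returns one leaderboard table per entry in `comp_ids`. Below is a minimal, self-contained sketch of that pattern; the `TABLES` data and the `filtered` helper are hypothetical stand-ins for `utils.leaderboard_data`.

```python
import gradio as gr
import pandas as pd

# Hypothetical per-competition leaderboard data (stand-in for utils.leaderboard_data).
TABLES = {
    2: pd.DataFrame({"Name": ["org/model-a"], "Score": [0.42], "Stale": [False]}),
    3: pd.DataFrame({"Name": ["org/model-b"], "Score": [0.37], "Stale": [True]}),
}

def filtered(df: pd.DataFrame, show_stale: bool) -> pd.DataFrame:
    # Drop stale rows unless the checkbox is ticked.
    return df if show_stale else df[~df["Stale"]]

with gr.Blocks() as demo:
    show_stale = gr.Checkbox(label="Show Stale")
    # One Dataframe component per competition, built in the same order as comp_ids.
    leaderboards = [gr.Dataframe(value=filtered(TABLES[c], False)) for c in (2, 3)]
    # The callback returns a list with one value per output component;
    # Gradio matches them to `outputs` by position.
    show_stale.change(
        lambda stale: [filtered(TABLES[c], stale) for c in (2, 3)],
        inputs=[show_stale],
        outputs=leaderboards,
    )

if __name__ == "__main__":
    demo.launch()
```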
competitions.py
CHANGED
```diff
@@ -1,4 +1,5 @@
 from dataclasses import dataclass
+import html
 from typing import Dict
 
 
@@ -21,5 +22,14 @@ COMPETITION_DETAILS: Dict[int, CompetitionDetails] = {
        name="General Knowledge Chat-bot",
        # TODO: Add link to SN1 dataset details.
        html_description="""<b>Competition ID 2</b><br/>Produce the best general knowledge chat-bot. Models are evaluated using synthetic MMLU-like dataset from Subnet 1.""",
+    ),
+    3: CompetitionDetails(
+        name="General Knowledge Chat-bot (BYO tokenizer)",
+        html_description="""<b>Competition ID 3</b><br/>Produce the best general knowledge chat-bot. Models bring their own tokenizer and are evaluated using synthetic MMLU-like dataset from Subnet 1.""",
    )
 }
+
+COMP_NAME_TO_ID = {
+    "B7_MULTI_CHOICE": 2,
+    "INSTRUCT_8B": 3,
+}
```
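The new `COMP_NAME_TO_ID` table ties the competition names logged to wandb (`B7_MULTI_CHOICE`, `INSTRUCT_8B`) back to the numeric keys of `COMPETITION_DETAILS`. A small usage sketch follows; the `details_for_wandb_name` helper is hypothetical, and the fallback to competition 2 mirrors the default applied in `utils.get_benchmarks`.

```python
from competitions import COMPETITION_DETAILS, COMP_NAME_TO_ID

def details_for_wandb_name(comp_name: str):
    # Unknown or missing names fall back to competition 2 (B7_MULTI_CHOICE),
    # matching the default used when reading benchmark runs.
    comp_id = COMP_NAME_TO_ID.get(comp_name, 2)
    return comp_id, COMPETITION_DETAILS[comp_id]

comp_id, details = details_for_wandb_name("INSTRUCT_8B")
print(comp_id, details.name)  # 3 General Knowledge Chat-bot (BYO tokenizer)
```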
utils.py
CHANGED
```diff
@@ -15,7 +15,9 @@ import pandas as pd
 import wandb
 from bittensor.extrinsics.serving import get_metadata
 from dotenv import load_dotenv
-from wandb.apis.public.history import SampledHistoryScan
+from wandb.apis.public.history import HistoryScan, SampledHistoryScan
+
+from competitions import COMP_NAME_TO_ID
 
 NETUID = 37
 DELAY_SECS = 3
@@ -331,17 +333,16 @@ def get_benchmarks() -> Tuple[pd.DataFrame, Dict[str, Dict[str, float]]]:
    runs = get_wandb_runs(
        project=BENCHMARK_WANDB_PROJECT, filters=None, order="+created_at"
    )
-    timestamps, uids, models, mmlu, mmlu_pro = [], [], [], [], []
+    timestamps, uids, models, comp_ids, mmlu, mmlu_pro = [], [], [], [], [], []
    for run in runs:
        uid = run.config.get("uid", None)
        model = run.config.get("model", None)
        if not uid or not model:
            continue
        samples = list(
-            SampledHistoryScan(
+            HistoryScan(
                run.client,
                run,
-                ["_timestamp", "mmlu.acc,none", "mmlu_pro"],
                0,
                1,
            )
@@ -349,6 +350,19 @@ def get_benchmarks() -> Tuple[pd.DataFrame, Dict[str, Dict[str, float]]]:
        if not samples:
            continue
        sample = samples[0]
+
+        # Make sure we have all the required keys.
+        has_all_keys = True
+        for required_key in ["mmlu.acc,none", "mmlu_pro", "_timestamp"]:
+            if required_key not in sample:
+                has_all_keys = False
+                break
+        if not has_all_keys:
+            continue
+
+        # Any run without a competition ID was for competition 2.
+        comp_name = sample.get("competition_id", "B7_MULTI_CHOICE")
+        comp_ids.append(COMP_NAME_TO_ID.get(comp_name, 2))
        timestamps.append(datetime.datetime.fromtimestamp(sample["_timestamp"]))
        mmlu.append(sample["mmlu.acc,none"])
        mmlu_pro.append(sample["mmlu_pro"])
@@ -360,6 +374,7 @@ def get_benchmarks() -> Tuple[pd.DataFrame, Dict[str, Dict[str, float]]]:
            "timestamp": timestamps,
            "uid": uids,
            "model": models,
+            "competition_id": comp_ids,
            "mmlu": mmlu,
            "mmlu_pro": mmlu_pro,
        }
@@ -463,8 +478,8 @@ def load_state_vars() -> dict[Any]:
    print("Loaded validator weights")
 
    # Compute loss over time for all competitions.
-    losses_2 = get_losses_over_time(vali_runs, 2)
-    print("Loaded losses over time for comp 2")
+    # losses_2 = get_losses_over_time(vali_runs, 2)
+    # print("Loaded losses over time for comp 2")
 
    benchmarks_df, benchmarks_targets = get_benchmarks()
    print("Loaded benchmarks")
@@ -486,5 +501,4 @@ def load_state_vars() -> dict[Any]:
        "validator_df": validator_df,
        "benchmarks_df": benchmarks_df,
        "benchmarks_targets": benchmarks_targets,
-        "losses_2": losses_2,
    }
```
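With `competition_id` now carried in the benchmarks DataFrame, downstream code can split benchmark history per competition, which is what `create_benchmark_plot` in `app.py` relies on. A minimal sketch with hypothetical data:

```python
import pandas as pd

# Hypothetical rows in the shape produced by utils.get_benchmarks().
benchmarks_df = pd.DataFrame(
    {
        "timestamp": pd.to_datetime(["2024-10-01", "2024-10-02", "2024-10-02"]),
        "uid": [10, 11, 12],
        "model": ["org/model-a", "org/model-b", "org/model-c"],
        "competition_id": [2, 2, 3],
        "mmlu": [0.61, 0.63, 0.58],
        "mmlu_pro": [0.31, 0.33, 0.29],
    }
)

# Per-competition view, equivalent to the filter used in app.py:
# benchmarks_df[benchmarks_df["competition_id"] == comp_id]
for comp_id, df in benchmarks_df.groupby("competition_id"):
    print(comp_id, df["mmlu"].max(), df["mmlu_pro"].max())
```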