Spaces:
Runtime error
Runtime error
add samples and perplexity
Browse files
app.py
CHANGED
@@ -19,13 +19,12 @@ FONT = """<link href="https://fonts.cdnfonts.com/css/jmh-typewriter" rel="styles
|
|
19 |
TITLE = """<h1 align="center" id="space-title" class="typewriter">Subnet 6 Leaderboard</h1>"""
|
20 |
IMAGE = """<a href="https://discord.gg/jqVphNsB4H" target="_blank"><img src="https://i.ibb.co/88wyVQ7/nousgirl.png" alt="nousgirl" style="margin: auto; width: 20%; border: 0;" /></a>"""
|
21 |
HEADER = """<h2 align="center" class="typewriter"><a href="https://github.com/NousResearch/finetuning-subnet" target="_blank">Subnet 6</a> is a <a href="https://bittensor.com/" target="_blank">Bittensor</a> subnet that incentivizes the creation of the best open models by evaluating submissions on a constant stream of newly generated synthetic GPT-4 data. The models with the best <a href="https://github.com/NousResearch/finetuning-subnet/blob/master/docs/validator.md" target="_blank">head-to-head loss</a> on the evaluation data receive a steady emission of TAO.</h3>"""
|
22 |
-
EVALUATION_DETAILS = """<b>Name</b> is the 🤗 Hugging Face model name (click to go to the model card). <b>Rewards / Day</b> are the expected rewards per day for each model. <b>
|
23 |
EVALUATION_HEADER = """<h3 align="center">Shows the latest internal evaluation statistics as calculated by a validator run by Nous Research</h3>"""
|
24 |
VALIDATOR_WANDB_PROJECT = os.environ["VALIDATOR_WANDB_PROJECT"]
|
25 |
H4_TOKEN = os.environ.get("H4_TOKEN", None)
|
26 |
API = HfApi(token=H4_TOKEN)
|
27 |
REPO_ID = "NousResearch/finetuning_subnet_leaderboard"
|
28 |
-
MAX_AVG_LOSS_POINTS = 1
|
29 |
METAGRAPH_RETRIES = 5
|
30 |
METAGRAPH_DELAY_SECS = 3
|
31 |
NETUID = 6
|
@@ -135,7 +134,17 @@ def get_float_score(key: str, history) -> typing.Tuple[typing.Optional[float], b
|
|
135 |
return float(data[-1]), False
|
136 |
return None, False
|
137 |
|
138 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
139 |
api = wandb.Api()
|
140 |
runs = list(api.runs(VALIDATOR_WANDB_PROJECT))
|
141 |
|
@@ -145,16 +154,18 @@ def get_scores(uids: typing.List[int]) -> typing.Dict[int, typing.Dict[str, typi
|
|
145 |
for uid in uids:
|
146 |
if uid in result.keys():
|
147 |
continue
|
148 |
-
|
149 |
win_rate, win_rate_fresh = get_float_score(f"win_rate_data.{uid}", history)
|
150 |
win_total, win_total_fresh = get_float_score(f"win_total_data.{uid}", history)
|
151 |
weight, weight_fresh = get_float_score(f"weight_data.{uid}", history)
|
|
|
152 |
result[uid] = {
|
153 |
-
"
|
154 |
"win_rate": win_rate,
|
155 |
"win_total": win_total,
|
156 |
"weight": weight,
|
157 |
-
"
|
|
|
158 |
}
|
159 |
if len(result.keys()) == len(uids):
|
160 |
break
|
@@ -208,7 +219,7 @@ def leaderboard_data(show_stale: bool):
|
|
208 |
[
|
209 |
f'[{c.namespace}/{c.name} ({c.commit[0:8]})](https://huggingface.co/{c.namespace}/{c.name}/commit/{c.commit})',
|
210 |
format_score(c.uid, scores, "win_rate"),
|
211 |
-
format_score(c.uid, scores, "
|
212 |
format_score(c.uid, scores, "weight"),
|
213 |
c.uid,
|
214 |
c.block
|
@@ -232,10 +243,18 @@ with demo:
|
|
232 |
|
233 |
with gr.Accordion("Evaluation Stats"):
|
234 |
gr.HTML(EVALUATION_HEADER)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
235 |
show_stale = gr.Checkbox(label="Show Stale", interactive=True)
|
236 |
leaderboard_table = gr.components.Dataframe(
|
237 |
value=leaderboard_data(show_stale.value),
|
238 |
-
headers=["Name", "Win Rate", "
|
239 |
datatype=["markdown", "number", "number", "number", "number", "number"],
|
240 |
elem_id="leaderboard-table",
|
241 |
interactive=False,
|
|
|
19 |
TITLE = """<h1 align="center" id="space-title" class="typewriter">Subnet 6 Leaderboard</h1>"""
|
20 |
IMAGE = """<a href="https://discord.gg/jqVphNsB4H" target="_blank"><img src="https://i.ibb.co/88wyVQ7/nousgirl.png" alt="nousgirl" style="margin: auto; width: 20%; border: 0;" /></a>"""
|
21 |
HEADER = """<h2 align="center" class="typewriter"><a href="https://github.com/NousResearch/finetuning-subnet" target="_blank">Subnet 6</a> is a <a href="https://bittensor.com/" target="_blank">Bittensor</a> subnet that incentivizes the creation of the best open models by evaluating submissions on a constant stream of newly generated synthetic GPT-4 data. The models with the best <a href="https://github.com/NousResearch/finetuning-subnet/blob/master/docs/validator.md" target="_blank">head-to-head loss</a> on the evaluation data receive a steady emission of TAO.</h3>"""
|
22 |
+
EVALUATION_DETAILS = """<b>Name</b> is the 🤗 Hugging Face model name (click to go to the model card). <b>Rewards / Day</b> are the expected rewards per day for each model. <b>Perplexity</b> is represents the loss on all of the evaluation data for the model as calculated by the validator (lower is better). <b>UID</b> is the Bittensor user id of the submitter. <b>Block</b> is the Bittensor block that the model was submitted in. More stats on <a href="https://taostats.io/subnets/netuid-6/" target="_blank">taostats</a>."""
|
23 |
EVALUATION_HEADER = """<h3 align="center">Shows the latest internal evaluation statistics as calculated by a validator run by Nous Research</h3>"""
|
24 |
VALIDATOR_WANDB_PROJECT = os.environ["VALIDATOR_WANDB_PROJECT"]
|
25 |
H4_TOKEN = os.environ.get("H4_TOKEN", None)
|
26 |
API = HfApi(token=H4_TOKEN)
|
27 |
REPO_ID = "NousResearch/finetuning_subnet_leaderboard"
|
|
|
28 |
METAGRAPH_RETRIES = 5
|
29 |
METAGRAPH_DELAY_SECS = 3
|
30 |
NETUID = 6
|
|
|
134 |
return float(data[-1]), False
|
135 |
return None, False
|
136 |
|
137 |
+
def get_sample(uid, history) -> typing.Optional[typing.Tuple[str, str]]:
    """Return the latest (prompt, response) sample logged for a UID.

    Args:
        uid: Bittensor UID whose sample columns are looked up.
        history: wandb run history; maps column names to value sequences
            (assumed DataFrame-like — `in` tests column membership).

    Returns:
        The most recent (prompt, response) pair when both columns exist and
        both latest entries are strings; otherwise ``None``.
    """
    prompt_key = f"sample_prompt_data.{uid}"
    response_key = f"sample_response_data.{uid}"
    # Bug fix: the original `if prompt_key and response_key in history:` only
    # tested membership of response_key (a non-empty string is always truthy),
    # raising KeyError when the prompt column was absent.
    if prompt_key in history and response_key in history:
        prompt = list(history[prompt_key])[-1]
        response = list(history[response_key])[-1]
        # Columns may contain non-string placeholders (e.g. NaN) for runs
        # that logged no sample; only return a fully string pair.
        if isinstance(prompt, str) and isinstance(response, str):
            return prompt, response
    return None
|
146 |
+
|
147 |
+
def get_scores(uids: typing.List[int]) -> typing.Dict[int, typing.Dict[str, typing.Optional[float | str]]]:
|
148 |
api = wandb.Api()
|
149 |
runs = list(api.runs(VALIDATOR_WANDB_PROJECT))
|
150 |
|
|
|
154 |
for uid in uids:
|
155 |
if uid in result.keys():
|
156 |
continue
|
157 |
+
perplexity, perplexity_fresh = get_float_score(f"perplexity_data.{uid}", history)
|
158 |
win_rate, win_rate_fresh = get_float_score(f"win_rate_data.{uid}", history)
|
159 |
win_total, win_total_fresh = get_float_score(f"win_total_data.{uid}", history)
|
160 |
weight, weight_fresh = get_float_score(f"weight_data.{uid}", history)
|
161 |
+
sample = get_sample(uid, history)
|
162 |
result[uid] = {
|
163 |
+
"perplexity": perplexity,
|
164 |
"win_rate": win_rate,
|
165 |
"win_total": win_total,
|
166 |
"weight": weight,
|
167 |
+
"sample": sample,
|
168 |
+
"fresh": perplexity_fresh and win_rate_fresh and win_total_fresh
|
169 |
}
|
170 |
if len(result.keys()) == len(uids):
|
171 |
break
|
|
|
219 |
[
|
220 |
f'[{c.namespace}/{c.name} ({c.commit[0:8]})](https://huggingface.co/{c.namespace}/{c.name}/commit/{c.commit})',
|
221 |
format_score(c.uid, scores, "win_rate"),
|
222 |
+
format_score(c.uid, scores, "perplexity"),
|
223 |
format_score(c.uid, scores, "weight"),
|
224 |
c.uid,
|
225 |
c.block
|
|
|
243 |
|
244 |
with gr.Accordion("Evaluation Stats"):
|
245 |
gr.HTML(EVALUATION_HEADER)
|
246 |
+
|
247 |
+
with gr.Tabs():
|
248 |
+
for entry in leaderboard_df:
|
249 |
+
sample = scores[entry.uid]["sample"]
|
250 |
+
if sample is not None:
|
251 |
+
with gr.Tab(f"{entry.namespace}/{entry.name} ({entry.commit[0:8]})"):
|
252 |
+
gr.Chatbot([sample])
|
253 |
+
|
254 |
show_stale = gr.Checkbox(label="Show Stale", interactive=True)
|
255 |
leaderboard_table = gr.components.Dataframe(
|
256 |
value=leaderboard_data(show_stale.value),
|
257 |
+
headers=["Name", "Win Rate", "Perplexity", "Weight", "UID", "Block"],
|
258 |
datatype=["markdown", "number", "number", "number", "number", "number"],
|
259 |
elem_id="leaderboard-table",
|
260 |
interactive=False,
|