emozilla committed on
Commit
1684477
·
1 Parent(s): 20271a6

add samples and perplexity

Browse files
Files changed (1) hide show
  1. app.py +27 -8
app.py CHANGED
@@ -19,13 +19,12 @@ FONT = """<link href="https://fonts.cdnfonts.com/css/jmh-typewriter" rel="styles
19
  TITLE = """<h1 align="center" id="space-title" class="typewriter">Subnet 6 Leaderboard</h1>"""
20
  IMAGE = """<a href="https://discord.gg/jqVphNsB4H" target="_blank"><img src="https://i.ibb.co/88wyVQ7/nousgirl.png" alt="nousgirl" style="margin: auto; width: 20%; border: 0;" /></a>"""
21
  HEADER = """<h2 align="center" class="typewriter"><a href="https://github.com/NousResearch/finetuning-subnet" target="_blank">Subnet 6</a> is a <a href="https://bittensor.com/" target="_blank">Bittensor</a> subnet that incentivizes the creation of the best open models by evaluating submissions on a constant stream of newly generated synthetic GPT-4 data. The models with the best <a href="https://github.com/NousResearch/finetuning-subnet/blob/master/docs/validator.md" target="_blank">head-to-head loss</a> on the evaluation data receive a steady emission of TAO.</h3>"""
22
- EVALUATION_DETAILS = """<b>Name</b> is the 🤗 Hugging Face model name (click to go to the model card). <b>Rewards / Day</b> are the expected rewards per day for each model. <b>Last Average Loss</b> is the last loss value on the evaluation data for the model as calculated by a validator (lower is better). <b>UID</b> is the Bittensor user id of the submitter. <b>Block</b> is the Bittensor block that the model was submitted in. More stats on <a href="https://taostats.io/subnets/netuid-6/" target="_blank">taostats</a>."""
23
  EVALUATION_HEADER = """<h3 align="center">Shows the latest internal evaluation statistics as calculated by a validator run by Nous Research</h3>"""
24
  VALIDATOR_WANDB_PROJECT = os.environ["VALIDATOR_WANDB_PROJECT"]
25
  H4_TOKEN = os.environ.get("H4_TOKEN", None)
26
  API = HfApi(token=H4_TOKEN)
27
  REPO_ID = "NousResearch/finetuning_subnet_leaderboard"
28
- MAX_AVG_LOSS_POINTS = 1
29
  METAGRAPH_RETRIES = 5
30
  METAGRAPH_DELAY_SECS = 3
31
  NETUID = 6
@@ -135,7 +134,17 @@ def get_float_score(key: str, history) -> typing.Tuple[typing.Optional[float], b
135
  return float(data[-1]), False
136
  return None, False
137
 
138
- def get_scores(uids: typing.List[int]) -> typing.Dict[int, typing.Dict[str, typing.Optional[float]]]:
 
 
 
 
 
 
 
 
 
 
139
  api = wandb.Api()
140
  runs = list(api.runs(VALIDATOR_WANDB_PROJECT))
141
 
@@ -145,16 +154,18 @@ def get_scores(uids: typing.List[int]) -> typing.Dict[int, typing.Dict[str, typi
145
  for uid in uids:
146
  if uid in result.keys():
147
  continue
148
- avg_loss, avg_loss_fresh = get_float_score(f"uid_data.{uid}", history)
149
  win_rate, win_rate_fresh = get_float_score(f"win_rate_data.{uid}", history)
150
  win_total, win_total_fresh = get_float_score(f"win_total_data.{uid}", history)
151
  weight, weight_fresh = get_float_score(f"weight_data.{uid}", history)
 
152
  result[uid] = {
153
- "avg_loss": avg_loss,
154
  "win_rate": win_rate,
155
  "win_total": win_total,
156
  "weight": weight,
157
- "fresh": avg_loss_fresh and win_rate_fresh and win_total_fresh
 
158
  }
159
  if len(result.keys()) == len(uids):
160
  break
@@ -208,7 +219,7 @@ def leaderboard_data(show_stale: bool):
208
  [
209
  f'[{c.namespace}/{c.name} ({c.commit[0:8]})](https://huggingface.co/{c.namespace}/{c.name}/commit/{c.commit})',
210
  format_score(c.uid, scores, "win_rate"),
211
- format_score(c.uid, scores, "avg_loss"),
212
  format_score(c.uid, scores, "weight"),
213
  c.uid,
214
  c.block
@@ -232,10 +243,18 @@ with demo:
232
 
233
  with gr.Accordion("Evaluation Stats"):
234
  gr.HTML(EVALUATION_HEADER)
 
 
 
 
 
 
 
 
235
  show_stale = gr.Checkbox(label="Show Stale", interactive=True)
236
  leaderboard_table = gr.components.Dataframe(
237
  value=leaderboard_data(show_stale.value),
238
- headers=["Name", "Win Rate", "Average Loss", "Weight", "UID", "Block"],
239
  datatype=["markdown", "number", "number", "number", "number", "number"],
240
  elem_id="leaderboard-table",
241
  interactive=False,
 
19
  TITLE = """<h1 align="center" id="space-title" class="typewriter">Subnet 6 Leaderboard</h1>"""
20
  IMAGE = """<a href="https://discord.gg/jqVphNsB4H" target="_blank"><img src="https://i.ibb.co/88wyVQ7/nousgirl.png" alt="nousgirl" style="margin: auto; width: 20%; border: 0;" /></a>"""
21
  HEADER = """<h2 align="center" class="typewriter"><a href="https://github.com/NousResearch/finetuning-subnet" target="_blank">Subnet 6</a> is a <a href="https://bittensor.com/" target="_blank">Bittensor</a> subnet that incentivizes the creation of the best open models by evaluating submissions on a constant stream of newly generated synthetic GPT-4 data. The models with the best <a href="https://github.com/NousResearch/finetuning-subnet/blob/master/docs/validator.md" target="_blank">head-to-head loss</a> on the evaluation data receive a steady emission of TAO.</h3>"""
22
# Legend text shown above the leaderboard table; rendered as HTML by Gradio.
# Fix: "is represents the loss" -> "represents the loss" (user-facing grammar bug).
EVALUATION_DETAILS = """<b>Name</b> is the 🤗 Hugging Face model name (click to go to the model card). <b>Rewards / Day</b> are the expected rewards per day for each model. <b>Perplexity</b> represents the loss on all of the evaluation data for the model as calculated by the validator (lower is better). <b>UID</b> is the Bittensor user id of the submitter. <b>Block</b> is the Bittensor block that the model was submitted in. More stats on <a href="https://taostats.io/subnets/netuid-6/" target="_blank">taostats</a>."""
23
  EVALUATION_HEADER = """<h3 align="center">Shows the latest internal evaluation statistics as calculated by a validator run by Nous Research</h3>"""
24
  VALIDATOR_WANDB_PROJECT = os.environ["VALIDATOR_WANDB_PROJECT"]
25
  H4_TOKEN = os.environ.get("H4_TOKEN", None)
26
  API = HfApi(token=H4_TOKEN)
27
  REPO_ID = "NousResearch/finetuning_subnet_leaderboard"
 
28
  METAGRAPH_RETRIES = 5
29
  METAGRAPH_DELAY_SECS = 3
30
  NETUID = 6
 
134
  return float(data[-1]), False
135
  return None, False
136
 
137
def get_sample(uid, history) -> typing.Optional[typing.Tuple[str, str]]:
    """Return the most recent (prompt, response) sample for a UID, or None.

    Args:
        uid: Bittensor UID whose sample columns are looked up.
        history: wandb run history — a mapping/DataFrame-like object whose
            membership test (`key in history`) checks column names.

    Returns:
        The latest (prompt, response) pair when both columns exist and their
        latest values are strings; otherwise None.
    """
    prompt_key = f"sample_prompt_data.{uid}"
    response_key = f"sample_response_data.{uid}"
    # BUG FIX: original condition was `if prompt_key and response_key in history:`
    # which only tested membership of response_key (prompt_key is a non-empty
    # string, hence always truthy). If only the response column existed,
    # history[prompt_key] raised KeyError. Test both keys explicitly.
    if prompt_key in history and response_key in history:
        prompt = list(history[prompt_key])[-1]
        response = list(history[response_key])[-1]
        # Guard against non-string latest entries (wandb logs NaN/None for
        # steps where no sample was recorded).
        if isinstance(prompt, str) and isinstance(response, str):
            return prompt, response
    return None
146
+
147
+ def get_scores(uids: typing.List[int]) -> typing.Dict[int, typing.Dict[str, typing.Optional[float | str]]]:
148
  api = wandb.Api()
149
  runs = list(api.runs(VALIDATOR_WANDB_PROJECT))
150
 
 
154
  for uid in uids:
155
  if uid in result.keys():
156
  continue
157
+ perplexity, perplexity_fresh = get_float_score(f"perplexity_data.{uid}", history)
158
  win_rate, win_rate_fresh = get_float_score(f"win_rate_data.{uid}", history)
159
  win_total, win_total_fresh = get_float_score(f"win_total_data.{uid}", history)
160
  weight, weight_fresh = get_float_score(f"weight_data.{uid}", history)
161
+ sample = get_sample(uid, history)
162
  result[uid] = {
163
+ "perplexity": perplexity,
164
  "win_rate": win_rate,
165
  "win_total": win_total,
166
  "weight": weight,
167
+ "sample": sample,
168
+ "fresh": perplexity_fresh and win_rate_fresh and win_total_fresh
169
  }
170
  if len(result.keys()) == len(uids):
171
  break
 
219
  [
220
  f'[{c.namespace}/{c.name} ({c.commit[0:8]})](https://huggingface.co/{c.namespace}/{c.name}/commit/{c.commit})',
221
  format_score(c.uid, scores, "win_rate"),
222
+ format_score(c.uid, scores, "perplexity"),
223
  format_score(c.uid, scores, "weight"),
224
  c.uid,
225
  c.block
 
243
 
244
  with gr.Accordion("Evaluation Stats"):
245
  gr.HTML(EVALUATION_HEADER)
246
+
247
+ with gr.Tabs():
248
+ for entry in leaderboard_df:
249
+ sample = scores[entry.uid]["sample"]
250
+ if sample is not None:
251
+ with gr.Tab(f"{entry.namespace}/{entry.name} ({entry.commit[0:8]})"):
252
+ gr.Chatbot([sample])
253
+
254
  show_stale = gr.Checkbox(label="Show Stale", interactive=True)
255
  leaderboard_table = gr.components.Dataframe(
256
  value=leaderboard_data(show_stale.value),
257
+ headers=["Name", "Win Rate", "Perplexity", "Weight", "UID", "Block"],
258
  datatype=["markdown", "number", "number", "number", "number", "number"],
259
  elem_id="leaderboard-table",
260
  interactive=False,