Spaces:
Runtime error
Runtime error
add samples and perplexity
Browse files
app.py
CHANGED
@@ -19,13 +19,12 @@ FONT = """<link href="https://fonts.cdnfonts.com/css/jmh-typewriter" rel="styles
|
|
19 |
TITLE = """<h1 align="center" id="space-title" class="typewriter">Subnet 6 Leaderboard</h1>"""
|
20 |
IMAGE = """<a href="https://discord.gg/jqVphNsB4H" target="_blank"><img src="https://i.ibb.co/88wyVQ7/nousgirl.png" alt="nousgirl" style="margin: auto; width: 20%; border: 0;" /></a>"""
|
21 |
HEADER = """<h2 align="center" class="typewriter"><a href="https://github.com/NousResearch/finetuning-subnet" target="_blank">Subnet 6</a> is a <a href="https://bittensor.com/" target="_blank">Bittensor</a> subnet that incentivizes the creation of the best open models by evaluating submissions on a constant stream of newly generated synthetic GPT-4 data. The models with the best <a href="https://github.com/NousResearch/finetuning-subnet/blob/master/docs/validator.md" target="_blank">head-to-head loss</a> on the evaluation data receive a steady emission of TAO.</h3>"""
|
22 |
-
EVALUATION_DETAILS = """<b>Name</b> is the 🤗 Hugging Face model name (click to go to the model card). <b>Rewards / Day</b> are the expected rewards per day for each model. <b>
|
23 |
EVALUATION_HEADER = """<h3 align="center">Shows the latest internal evaluation statistics as calculated by a validator run by Nous Research</h3>"""
|
24 |
VALIDATOR_WANDB_PROJECT = os.environ["VALIDATOR_WANDB_PROJECT"]
|
25 |
H4_TOKEN = os.environ.get("H4_TOKEN", None)
|
26 |
API = HfApi(token=H4_TOKEN)
|
27 |
REPO_ID = "NousResearch/finetuning_subnet_leaderboard"
|
28 |
-
MAX_AVG_LOSS_POINTS = 1
|
29 |
METAGRAPH_RETRIES = 5
|
30 |
METAGRAPH_DELAY_SECS = 3
|
31 |
NETUID = 6
|
@@ -135,7 +134,17 @@ def get_float_score(key: str, history) -> typing.Tuple[typing.Optional[float], b
|
|
135 |
return float(data[-1]), False
|
136 |
return None, False
|
137 |
|
138 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
139 |
api = wandb.Api()
|
140 |
runs = list(api.runs(VALIDATOR_WANDB_PROJECT))
|
141 |
|
@@ -145,16 +154,18 @@ def get_scores(uids: typing.List[int]) -> typing.Dict[int, typing.Dict[str, typi
|
|
145 |
for uid in uids:
|
146 |
if uid in result.keys():
|
147 |
continue
|
148 |
-
|
149 |
win_rate, win_rate_fresh = get_float_score(f"win_rate_data.{uid}", history)
|
150 |
win_total, win_total_fresh = get_float_score(f"win_total_data.{uid}", history)
|
151 |
weight, weight_fresh = get_float_score(f"weight_data.{uid}", history)
|
|
|
152 |
result[uid] = {
|
153 |
-
"
|
154 |
"win_rate": win_rate,
|
155 |
"win_total": win_total,
|
156 |
"weight": weight,
|
157 |
-
"
|
|
|
158 |
}
|
159 |
if len(result.keys()) == len(uids):
|
160 |
break
|
@@ -208,7 +219,7 @@ def leaderboard_data(show_stale: bool):
|
|
208 |
[
|
209 |
f'[{c.namespace}/{c.name} ({c.commit[0:8]})](https://huggingface.co/{c.namespace}/{c.name}/commit/{c.commit})',
|
210 |
format_score(c.uid, scores, "win_rate"),
|
211 |
-
format_score(c.uid, scores, "
|
212 |
format_score(c.uid, scores, "weight"),
|
213 |
c.uid,
|
214 |
c.block
|
@@ -232,10 +243,18 @@ with demo:
|
|
232 |
|
233 |
with gr.Accordion("Evaluation Stats"):
|
234 |
gr.HTML(EVALUATION_HEADER)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
235 |
show_stale = gr.Checkbox(label="Show Stale", interactive=True)
|
236 |
leaderboard_table = gr.components.Dataframe(
|
237 |
value=leaderboard_data(show_stale.value),
|
238 |
-
headers=["Name", "Win Rate", "
|
239 |
datatype=["markdown", "number", "number", "number", "number", "number"],
|
240 |
elem_id="leaderboard-table",
|
241 |
interactive=False,
|
|
|
19 |
TITLE = """<h1 align="center" id="space-title" class="typewriter">Subnet 6 Leaderboard</h1>"""
|
20 |
IMAGE = """<a href="https://discord.gg/jqVphNsB4H" target="_blank"><img src="https://i.ibb.co/88wyVQ7/nousgirl.png" alt="nousgirl" style="margin: auto; width: 20%; border: 0;" /></a>"""
|
21 |
HEADER = """<h2 align="center" class="typewriter"><a href="https://github.com/NousResearch/finetuning-subnet" target="_blank">Subnet 6</a> is a <a href="https://bittensor.com/" target="_blank">Bittensor</a> subnet that incentivizes the creation of the best open models by evaluating submissions on a constant stream of newly generated synthetic GPT-4 data. The models with the best <a href="https://github.com/NousResearch/finetuning-subnet/blob/master/docs/validator.md" target="_blank">head-to-head loss</a> on the evaluation data receive a steady emission of TAO.</h3>"""
|
22 |
+
EVALUATION_DETAILS = """<b>Name</b> is the 🤗 Hugging Face model name (click to go to the model card). <b>Rewards / Day</b> are the expected rewards per day for each model. <b>Perplexity</b> is represents the loss on all of the evaluation data for the model as calculated by the validator (lower is better). <b>UID</b> is the Bittensor user id of the submitter. <b>Block</b> is the Bittensor block that the model was submitted in. More stats on <a href="https://taostats.io/subnets/netuid-6/" target="_blank">taostats</a>."""
|
23 |
EVALUATION_HEADER = """<h3 align="center">Shows the latest internal evaluation statistics as calculated by a validator run by Nous Research</h3>"""
|
24 |
VALIDATOR_WANDB_PROJECT = os.environ["VALIDATOR_WANDB_PROJECT"]
|
25 |
H4_TOKEN = os.environ.get("H4_TOKEN", None)
|
26 |
API = HfApi(token=H4_TOKEN)
|
27 |
REPO_ID = "NousResearch/finetuning_subnet_leaderboard"
|
|
|
28 |
METAGRAPH_RETRIES = 5
|
29 |
METAGRAPH_DELAY_SECS = 3
|
30 |
NETUID = 6
|
|
|
134 |
return float(data[-1]), False
|
135 |
return None, False
|
136 |
|
137 |
+
def get_sample(uid, history) -> typing.Optional[typing.Tuple[str, str]]:
    """Return the latest (prompt, response) sample logged for a UID.

    Args:
        uid: Bittensor UID whose sample columns are looked up.
        history: wandb run history; maps column names to value sequences
            (assumed DataFrame-like — `in` tests column membership).

    Returns:
        The most recent (prompt, response) pair when both columns exist and
        both latest entries are strings; otherwise ``None``.
    """
    prompt_key = f"sample_prompt_data.{uid}"
    response_key = f"sample_response_data.{uid}"
    # Bug fix: the original `if prompt_key and response_key in history:` only
    # tested membership of response_key (a non-empty string is always truthy),
    # raising KeyError when the prompt column was absent.
    if prompt_key in history and response_key in history:
        prompt = list(history[prompt_key])[-1]
        response = list(history[response_key])[-1]
        # Columns may contain non-string placeholders (e.g. NaN) for runs
        # that logged no sample; only return a fully string pair.
        if isinstance(prompt, str) and isinstance(response, str):
            return prompt, response
    return None
|
146 |
+
|
147 |
+
def get_scores(uids: typing.List[int]) -> typing.Dict[int, typing.Dict[str, typing.Optional[float | str]]]:
|
148 |
api = wandb.Api()
|
149 |
runs = list(api.runs(VALIDATOR_WANDB_PROJECT))
|
150 |
|
|
|
154 |
for uid in uids:
|
155 |
if uid in result.keys():
|
156 |
continue
|
157 |
+
perplexity, perplexity_fresh = get_float_score(f"perplexity_data.{uid}", history)
|
158 |
win_rate, win_rate_fresh = get_float_score(f"win_rate_data.{uid}", history)
|
159 |
win_total, win_total_fresh = get_float_score(f"win_total_data.{uid}", history)
|
160 |
weight, weight_fresh = get_float_score(f"weight_data.{uid}", history)
|
161 |
+
sample = get_sample(uid, history)
|
162 |
result[uid] = {
|
163 |
+
"perplexity": perplexity,
|
164 |
"win_rate": win_rate,
|
165 |
"win_total": win_total,
|
166 |
"weight": weight,
|
167 |
+
"sample": sample,
|
168 |
+
"fresh": perplexity_fresh and win_rate_fresh and win_total_fresh
|
169 |
}
|
170 |
if len(result.keys()) == len(uids):
|
171 |
break
|
|
|
219 |
[
|
220 |
f'[{c.namespace}/{c.name} ({c.commit[0:8]})](https://huggingface.co/{c.namespace}/{c.name}/commit/{c.commit})',
|
221 |
format_score(c.uid, scores, "win_rate"),
|
222 |
+
format_score(c.uid, scores, "perplexity"),
|
223 |
format_score(c.uid, scores, "weight"),
|
224 |
c.uid,
|
225 |
c.block
|
|
|
243 |
|
244 |
with gr.Accordion("Evaluation Stats"):
|
245 |
gr.HTML(EVALUATION_HEADER)
|
246 |
+
|
247 |
+
with gr.Tabs():
|
248 |
+
for entry in leaderboard_df:
|
249 |
+
sample = scores[entry.uid]["sample"]
|
250 |
+
if sample is not None:
|
251 |
+
with gr.Tab(f"{entry.namespace}/{entry.name} ({entry.commit[0:8]})"):
|
252 |
+
gr.Chatbot([sample])
|
253 |
+
|
254 |
show_stale = gr.Checkbox(label="Show Stale", interactive=True)
|
255 |
leaderboard_table = gr.components.Dataframe(
|
256 |
value=leaderboard_data(show_stale.value),
|
257 |
+
headers=["Name", "Win Rate", "Perplexity", "Weight", "UID", "Block"],
|
258 |
datatype=["markdown", "number", "number", "number", "number", "number"],
|
259 |
elem_id="leaderboard-table",
|
260 |
interactive=False,
|