Spaces:

ttsds
/

benchmark

Runtime error

App Files Files Community

cdminix committed on Jul 18, 2024

Commit

026ee6b

1 Parent(s): 9c2d40e

setup leaderboard

Browse files

Files changed (11) hide show

README.md +4 -4
app.py +78 -116
new/app.py +270 -0
new/requirements.txt +17 -0
new/src/css_html_js.py +101 -0
new/src/envs.py +38 -0
new/src/texts.py +37 -0
src/about.py +18 -30
src/display/utils.py +8 -72
src/leaderboard/read_evals.py +11 -65
src/submission/submit.py +3 -3

README.md CHANGED Viewed

@@ -17,9 +17,9 @@ Results files should have the following format and be stored as json files:
 ```json
 {
     "config": {
-        "model_dtype": "torch.float16", # or torch.bfloat16 or 8bit or 4bit
-        "model_name": "path of the model on the hub: org/model",
-        "model_sha": "revision on the hub",
     },
     "results": {
         "task_name": {
@@ -41,4 +41,4 @@ If you encounter problem on the space, don't hesitate to restart it to remove th
 You'll find
 - the main table's column names and properties in `src/display/utils.py`
 - the logic to read all results and request files, then convert them into dataframe lines, in `src/leaderboard/read_evals.py` and `src/populate.py`
-- teh logic to allow or filter submissions in `src/submission/submit.py` and `src/submission/check_validity.py`

 ```json
 {
     "config": {
+        "model_name": "name of the model",
+        "model_url": "url of the model",
+        "tags": ["tag1", "tag2"], // e.g. ["flow", "diffusion", "autoregressive", "end-to-end"]
     },
     "results": {
         "task_name": {
 You'll find
 - the main table's column names and properties in `src/display/utils.py`
 - the logic to read all results and request files, then convert them into dataframe lines, in `src/leaderboard/read_evals.py` and `src/populate.py`
+- the logic to allow or filter submissions in `src/submission/submit.py` and `src/submission/check_validity.py`

app.py CHANGED Viewed

@@ -19,10 +19,7 @@ from src.display.utils import (
     EVAL_COLS,
     EVAL_TYPES,
     AutoEvalColumn,
-    ModelType,
     fields,
-    WeightType,
-    Precision,
 )
 from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
 from src.populate import get_evaluation_queue_df, get_leaderboard_df
@@ -80,11 +77,9 @@ def init_leaderboard(dataframe):
             cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
             label="Select Columns to Display:",
         ),
-        search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
         hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
         filter_columns=[
-            ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
-            ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
             ColumnFilter(
                 AutoEvalColumn.params.name,
                 type="slider",
@@ -92,7 +87,6 @@ def init_leaderboard(dataframe):
                 max=150,
                 label="Select the number of parameters (B)",
             ),
-            ColumnFilter(AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True),
         ],
         bool_checkboxgroup_label="Hide models",
         interactive=False,
@@ -100,129 +94,97 @@ def init_leaderboard(dataframe):
 def show_leaderboard(profile: gr.OAuthProfile | None, oauth_token: gr.OAuthToken | None):
-    global demo
     if profile or True:
-        print(f"Logged in as {profile.name}")
-        with demo:
-            gr.HTML(TITLE)
-            gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
-            with gr.Tabs(elem_classes="tab-buttons") as tabs:
-                with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
-                    leaderboard = init_leaderboard(LEADERBOARD_DF)
-                with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
-                    gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
-                with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
-                    with gr.Column():
-                        with gr.Row():
-                            gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
-                        with gr.Column():
-                            with gr.Accordion(
-                                f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
-                                open=False,
-                            ):
-                                with gr.Row():
-                                    finished_eval_table = gr.components.Dataframe(
-                                        value=finished_eval_queue_df,
-                                        headers=EVAL_COLS,
-                                        datatype=EVAL_TYPES,
-                                        row_count=5,
-                                    )
-                            with gr.Accordion(
-                                f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
-                                open=False,
-                            ):
-                                with gr.Row():
-                                    running_eval_table = gr.components.Dataframe(
-                                        value=running_eval_queue_df,
-                                        headers=EVAL_COLS,
-                                        datatype=EVAL_TYPES,
-                                        row_count=5,
-                                    )
-                            with gr.Accordion(
-                                f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
-                                open=False,
-                            ):
-                                with gr.Row():
-                                    pending_eval_table = gr.components.Dataframe(
-                                        value=pending_eval_queue_df,
-                                        headers=EVAL_COLS,
-                                        datatype=EVAL_TYPES,
-                                        row_count=5,
-                                    )
                     with gr.Row():
-                        gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
-                    with gr.Row():
-                        with gr.Column():
-                            model_name_textbox = gr.Textbox(label="Model name")
-                            revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
-                            model_type = gr.Dropdown(
-                                choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
-                                label="Model type",
-                                multiselect=False,
-                                value=None,
-                                interactive=True,
-                            )
-                        with gr.Column():
-                            precision = gr.Dropdown(
-                                choices=[i.value.name for i in Precision if i != Precision.Unknown],
-                                label="Precision",
-                                multiselect=False,
-                                value="float16",
-                                interactive=True,
-                            )
-                            weight_type = gr.Dropdown(
-                                choices=[i.value.name for i in WeightType],
-                                label="Weights type",
-                                multiselect=False,
-                                value="Original",
-                                interactive=True,
-                            )
-                            base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
-                    submit_button = gr.Button("Submit Eval")
-                    submission_result = gr.Markdown()
-                    # submit_button.click(
-                    #     add_new_eval,
-                    #     [
-                    #         model_name_textbox,
-                    #         base_model_name_textbox,
-                    #         revision_name_textbox,
-                    #         precision,
-                    #         weight_type,
-                    #         model_type,
-                    #     ],
-                    #     submission_result,
-                    # )
-            with gr.Row():
-                with gr.Accordion("📙 Citation", open=False):
-                    citation_button = gr.Textbox(
-                        value=CITATION_BUTTON_TEXT,
-                        label=CITATION_BUTTON_LABEL,
-                        lines=20,
-                        elem_id="citation-button",
-                        show_copy_button=True,
-                    )
 demo = gr.Blocks(css=custom_css)
 with demo:
-    gr.LoginButton()
     m1 = gr.Markdown("Please login to see the leaderboard.")
-    demo.load(show_leaderboard, inputs=None, outputs=m1)
 scheduler = BackgroundScheduler()
 scheduler.add_job(restart_space, "interval", seconds=1800)
 scheduler.start()
-# demo.queue(default_concurrency_limit=40).launch()
-demo.launch()

     EVAL_COLS,
     EVAL_TYPES,
     AutoEvalColumn,
     fields,
 )
 from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
 from src.populate import get_evaluation_queue_df, get_leaderboard_df
             cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
             label="Select Columns to Display:",
         ),
+        search_columns=[AutoEvalColumn.model.name],
         hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
         filter_columns=[
             ColumnFilter(
                 AutoEvalColumn.params.name,
                 type="slider",
                 max=150,
                 label="Select the number of parameters (B)",
             ),
         ],
         bool_checkboxgroup_label="Hide models",
         interactive=False,
 def show_leaderboard(profile: gr.OAuthProfile | None, oauth_token: gr.OAuthToken | None):
     if profile or True:
+        gr.HTML(TITLE)
+        gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
+        with gr.Tabs(elem_classes="tab-buttons") as tabs:
+            with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
+                leaderboard = init_leaderboard(LEADERBOARD_DF)
+            with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
+                gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
+            with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
+                with gr.Column():
                     with gr.Row():
+                        gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
+                    with gr.Column():
+                        with gr.Accordion(
+                            f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
+                            open=False,
+                        ):
+                            with gr.Row():
+                                finished_eval_table = gr.components.Dataframe(
+                                    value=finished_eval_queue_df,
+                                    headers=EVAL_COLS,
+                                    datatype=EVAL_TYPES,
+                                    row_count=5,
+                                )
+                        with gr.Accordion(
+                            f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
+                            open=False,
+                        ):
+                            with gr.Row():
+                                running_eval_table = gr.components.Dataframe(
+                                    value=running_eval_queue_df,
+                                    headers=EVAL_COLS,
+                                    datatype=EVAL_TYPES,
+                                    row_count=5,
+                                )
+                        with gr.Accordion(
+                            f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
+                            open=False,
+                        ):
+                            with gr.Row():
+                                pending_eval_table = gr.components.Dataframe(
+                                    value=pending_eval_queue_df,
+                                    headers=EVAL_COLS,
+                                    datatype=EVAL_TYPES,
+                                    row_count=5,
+                                )
+                with gr.Row():
+                    gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
+                with gr.Row():
+                    with gr.Column():
+                        model_name_textbox = gr.Textbox(label="Model name")
+                submit_button = gr.Button("Submit Eval")
+                submission_result = gr.Markdown()
+                submit_button.click(
+                    add_new_eval,
+                    [
+                        model_name_textbox,
+                    ],
+                    submission_result,
+                )
+        with gr.Row():
+            with gr.Accordion("📙 Citation", open=False):
+                citation_button = gr.Textbox(
+                    value=CITATION_BUTTON_TEXT,
+                    label=CITATION_BUTTON_LABEL,
+                    lines=20,
+                    elem_id="citation-button",
+                    show_copy_button=True,
+                )
 demo = gr.Blocks(css=custom_css)
 with demo:
+    # gr.LoginButton()
     m1 = gr.Markdown("Please login to see the leaderboard.")
+    # demo.load(show_leaderboard, inputs=None, outputs=m1)
+    show_leaderboard(None, None)
 scheduler = BackgroundScheduler()
 scheduler.add_job(restart_space, "interval", seconds=1800)
 scheduler.start()
+demo.queue(default_concurrency_limit=40).launch()
+# demo.launch()

new/app.py ADDED Viewed

	@@ -0,0 +1,270 @@

+from pathlib import Path
+import json
+import gradio as gr
+from huggingface_hub import snapshot_download
+from gradio_leaderboard import Leaderboard, SelectColumns
+import pandas as pd
+from apscheduler.schedulers.background import BackgroundScheduler
+from ttsdb.benchmarks.benchmark import BenchmarkCategory
+from ttsdb import BenchmarkSuite
+from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN, TAGS
+from src.texts import LLM_BENCHMARKS_TEXT, EVALUATION_QUEUE_TEXT
+from src.css_html_js import custom_css
+def filter_dfs(tags, lb):
+    global f_b_df, f_a_df
+    is_agg = False
+    if "Environment" in lb.columns:
+        is_agg = True
+    if is_agg:
+        lb = f_a_df.copy()
+    else:
+        lb = f_b_df.copy()
+    if tags and len(lb) > 0:
+        lb = lb[lb["Tags"].apply(lambda x: any(tag in x for tag in tags))]
+    return lb
+def restart_space():
+    API.restart_space(repo_id=REPO_ID)
+def submit_eval(model_name, model_tags, web_url, hf_url, code_url, paper_url, inference_details, file_path):
+    model_id = model_name.lower().replace(" ", "_")
+    # check if model already exists
+    if Path(f"{EVAL_REQUESTS_PATH}/{model_id}.json").exists():
+        return "Model already exists in the evaluation queue"
+    # check which urls are valid
+    if web_url and not web_url.startswith("http"):
+        return "Please enter a valid URL"
+    if hf_url and not hf_url.startswith("http"):
+        return "Please enter a valid URL"
+    if code_url and not code_url.startswith("http"):
+        return "Please enter a valid URL"
+    if paper_url and not paper_url.startswith("http"):
+        return "Please enter a valid URL"
+    # move file to correct location
+    if not file_path.endswith(".tar.gz"):
+        return "Please upload a .tar.gz file"
+    Path(file_path).rename(f"{EVAL_REQUESTS_PATH}/{model_id}.tar.gz")
+    # build display name - use web_url to link text if available, and emojis for the other urls
+    display_name = model_name
+    if web_url:
+        display_name = f"[{display_name}]({web_url}) "
+    if hf_url:
+        display_name += f"[🤗]({hf_url})"
+    if code_url:
+        display_name += f"[💻]({code_url})"
+    if paper_url:
+        display_name += f"[📄]({paper_url})"
+    request_obj = {
+        "model_name": model_name,
+        "display_name": display_name,
+        "model_tags": model_tags,
+        "web_url": web_url,
+        "hf_url": hf_url,
+        "code_url": code_url,
+        "paper_url": paper_url,
+        "inference_details": inference_details,
+        "status": "pending",
+    }
+    with open(f"{EVAL_REQUESTS_PATH}/{model_id}.json", "w") as f:
+        json.dump(request_obj, f)
+    API.upload_file(
+        path_or_fileobj=f"{EVAL_REQUESTS_PATH}/{model_id}.json",
+        path_in_repo=f"{model_id}.json",
+        repo_id=QUEUE_REPO,
+        repo_type="dataset",
+        commit_message=f"Add {model_name} to evaluation queue",
+    )
+    API.upload_file(
+        path_or_fileobj=f"{EVAL_REQUESTS_PATH}/{model_id}.tar.gz",
+        path_in_repo=f"{model_id}.tar.gz",
+        repo_id=QUEUE_REPO,
+        repo_type="dataset",
+        commit_message=f"Add {model_name} to evaluation queue",
+    )
+    return "Model submitted successfully 🎉"
+### Space initialisation
+try:
+    print(EVAL_REQUESTS_PATH)
+    snapshot_download(
+        repo_id=QUEUE_REPO,
+        local_dir=EVAL_REQUESTS_PATH,
+        repo_type="dataset",
+        tqdm_class=None,
+        etag_timeout=30,
+        token=TOKEN,
+    )
+except Exception:
+    restart_space()
+try:
+    print(EVAL_RESULTS_PATH)
+    snapshot_download(
+        repo_id=RESULTS_REPO,
+        local_dir=EVAL_RESULTS_PATH,
+        repo_type="dataset",
+        tqdm_class=None,
+        etag_timeout=30,
+        token=TOKEN,
+    )
+except Exception:
+    restart_space()
+results_df = pd.read_csv(EVAL_RESULTS_PATH + "/results.csv")
+agg_df = BenchmarkSuite.aggregate_df(results_df)
+agg_df = agg_df.pivot(index="dataset", columns="benchmark_category", values="score")
+agg_df.rename(columns={"OVERALL": "General"}, inplace=True)
+agg_df.columns = [x.capitalize() for x in agg_df.columns]
+agg_df["Mean"] = agg_df.mean(axis=1)
+# make sure mean is the first column
+agg_df = agg_df[["Mean"] + [col for col in agg_df.columns if col != "Mean"]]
+for col in agg_df.columns:
+    agg_df[col] = agg_df[col].apply(lambda x: round(x, 2))
+agg_df["Tags"] = ""
+agg_df.reset_index(inplace=True)
+agg_df.rename(columns={"dataset": "Model"}, inplace=True)
+agg_df.sort_values("Mean", ascending=False, inplace=True)
+benchmark_df = results_df.pivot(index="dataset", columns="benchmark_name", values="score")
+# get benchmark name order by category
+benchmark_order = list(results_df.sort_values("benchmark_category")["benchmark_name"].unique())
+benchmark_df = benchmark_df[benchmark_order]
+benchmark_df = benchmark_df.reset_index()
+benchmark_df.rename(columns={"dataset": "Model"}, inplace=True)
+# set index
+benchmark_df.set_index("Model", inplace=True)
+benchmark_df["Mean"] = benchmark_df.mean(axis=1)
+# make sure mean is the first column
+benchmark_df = benchmark_df[["Mean"] + [col for col in benchmark_df.columns if col != "Mean"]]
+# round all
+for col in benchmark_df.columns:
+    benchmark_df[col] = benchmark_df[col].apply(lambda x: round(x, 2))
+benchmark_df["Tags"] = ""
+benchmark_df.reset_index(inplace=True)
+benchmark_df.sort_values("Mean", ascending=False, inplace=True)
+# get details for each model
+model_detail_files = Path(EVAL_REQUESTS_PATH).glob("*.json")
+model_details = {}
+for model_detail_file in model_detail_files:
+    with open(model_detail_file) as f:
+        model_detail = json.load(f)
+    model_details[model_detail_file.stem] = model_detail
+# replace .tar.gz
+benchmark_df["Model"] = benchmark_df["Model"].apply(lambda x: x.replace(".tar.gz", ""))
+agg_df["Model"] = agg_df["Model"].apply(lambda x: x.replace(".tar.gz", ""))
+benchmark_df["Tags"] = benchmark_df["Model"].apply(lambda x: model_details.get(x, {}).get("model_tags", ""))
+agg_df["Tags"] = agg_df["Model"].apply(lambda x: model_details.get(x, {}).get("model_tags", ""))
+benchmark_df["Model"] = benchmark_df["Model"].apply(lambda x: model_details.get(x, {}).get("display_name", x))
+agg_df["Model"] = agg_df["Model"].apply(lambda x: model_details.get(x, {}).get("display_name", x))
+f_b_df = benchmark_df.copy()
+f_a_df = agg_df.copy()
+def init_leaderboard(dataframe):
+    if dataframe is None or dataframe.empty:
+        raise ValueError("Leaderboard DataFrame is empty or None.")
+    df_types = []
+    for col in dataframe.columns:
+        if col == "Model":
+            df_types.append("markdown")
+        elif col == "Tags":
+            df_types.append("markdown")
+        else:
+            df_types.append("number")
+    return Leaderboard(
+        value=dataframe,
+        select_columns=SelectColumns(
+            default_selection=list(dataframe.columns),
+            cant_deselect=["Model", "Mean"],
+            label="Select Columns to Display:",
+        ),
+        search_columns=["Model", "Tags"],
+        filter_columns=[],
+        hide_columns=["Tags"],
+        interactive=False,
+        datatype=df_types,
+    )
+app = gr.Blocks(css=custom_css, title="TTS Benchmark Leaderboard")
+with app:
+    with gr.Tabs(elem_classes="tab-buttons") as tabs:
+        with gr.TabItem("🏅 TTSDB Scores", elem_id="llm-benchmark-tab-table", id=0):
+            tags = gr.Dropdown(
+                TAGS,
+                value=[],
+                multiselect=True,
+                label="Tags",
+                info="Select tags to filter the leaderboard. You can suggest new tags here: https://huggingface.co/spaces/ttsds/benchmark/discussions/1",
+            )
+            leaderboard = init_leaderboard(f_a_df)
+            tags.change(filter_dfs, [tags, leaderboard], [leaderboard])
+        with gr.TabItem("🏅 Individual Benchmarks", elem_id="llm-benchmark-tab-table", id=1):
+            tags = gr.Dropdown(
+                TAGS,
+                value=[],
+                multiselect=True,
+                label="Tags",
+                info="Select tags to filter the leaderboard",
+            )
+            leaderboard = init_leaderboard(f_b_df)
+            tags.change(filter_dfs, [tags, leaderboard], [leaderboard])
+        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
+            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
+        with gr.TabItem("🚀 Submit here!", elem_id="llm-benchmark-tab-table", id=3):
+            with gr.Column():
+                with gr.Row():
+                    gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
+                with gr.Row():
+                    gr.Markdown("# ✉️✨ Submit a TTS dataset here!", elem_classes="markdown-text")
+                with gr.Row():
+                    with gr.Column():
+                        model_name_textbox = gr.Textbox(label="Model name")
+                        model_tags_dropdown = gr.Dropdown(
+                            label="Model tags",
+                            choices=TAGS,
+                            multiselect=True,
+                        )
+                        website_url_textbox = gr.Textbox(label="Website URL (optional)")
+                        hf_url_textbox = gr.Textbox(label="Huggingface URL (optional)")
+                        code_url_textbox = gr.Textbox(label="Code URL (optional)")
+                        paper_url_textbox = gr.Textbox(label="Paper URL (optional)")
+                        inference_details_textbox = gr.TextArea(label="Inference details (optional)")
+                        file_input = gr.File(file_types=[".gz"], interactive=True, label=".tar.gz TTS dataset")
+                        submit_button = gr.Button("Submit Eval")
+                        submission_result = gr.Markdown()
+                        submit_button.click(
+                            submit_eval,
+                            [
+                                model_name_textbox,
+                                model_tags_dropdown,
+                                website_url_textbox,
+                                hf_url_textbox,
+                                code_url_textbox,
+                                paper_url_textbox,
+                                inference_details_textbox,
+                                file_input,
+                            ],
+                            submission_result,
+                        )
+scheduler = BackgroundScheduler()
+scheduler.add_job(restart_space, "interval", seconds=1800)
+scheduler.start()
+app.queue(default_concurrency_limit=40).launch()

new/requirements.txt ADDED Viewed

	@@ -0,0 +1,17 @@

+APScheduler
+black
+datasets
+gradio
+gradio[oauth]
+gradio_leaderboard==0.0.9
+gradio_client
+huggingface-hub>=0.18.0
+matplotlib
+numpy
+pandas
+python-dateutil
+tqdm
+transformers
+tokenizers>=0.15.0
+sentencepiece
+markdown

new/src/css_html_js.py ADDED Viewed

	@@ -0,0 +1,101 @@

+custom_css = """
+.markdown-text {
+    font-size: 16px !important;
+}
+#models-to-add-text {
+    font-size: 18px !important;
+}
+#citation-button span {
+    font-size: 16px !important;
+}
+#citation-button textarea {
+    font-size: 16px !important;
+}
+#citation-button > label > button {
+    margin: 6px;
+    transform: scale(1.3);
+}
+#leaderboard-table {
+    margin-top: 15px
+}
+#leaderboard-table-lite {
+    margin-top: 15px
+}
+#search-bar-table-box > div:first-child {
+    background: none;
+    border: none;
+}
+#search-bar {
+    padding: 0px;
+}
+/* Limit the width of the first AutoEvalColumn so that names don't expand too much */
+table td:first-child,
+table th:first-child {
+    max-width: 400px;
+    overflow: auto;
+    white-space: nowrap;
+}
+.tab-buttons button {
+    font-size: 20px;
+}
+#scale-logo {
+    border-style: none !important;
+    box-shadow: none;
+    display: block;
+    margin-left: auto;
+    margin-right: auto;
+    max-width: 600px;
+}
+#scale-logo .download {
+    display: none;
+}
+#filter_type{
+    border: 0;
+    padding-left: 0;
+    padding-top: 0;
+}
+#filter_type label {
+    display: flex;
+}
+#filter_type label > span{
+    margin-top: var(--spacing-lg);
+    margin-right: 0.5em;
+}
+#filter_type label > .wrap{
+    width: 103px;
+}
+#filter_type label > .wrap .wrap-inner{
+    padding: 2px;
+}
+#filter_type label > .wrap .wrap-inner input{
+    width: 1px
+}
+#filter-columns-type{
+    border:0;
+    padding:0.5;
+}
+#filter-columns-size{
+    border:0;
+    padding:0.5;
+}
+#box-filter > .form{
+    border: 0
+}
+.svelte-1m1obck:nth-of-type(2) {
+    display: none !important;
+}
+"""

new/src/envs.py ADDED Viewed

	@@ -0,0 +1,38 @@

+import os
+from huggingface_hub import HfApi
+# Info to change for your repository
+# ----------------------------------
+TOKEN = os.environ.get("TOKEN")  # A read/write token for your org
+OWNER = "ttsds"  # Change to your org - don't forget to create a results and request dataset, with the correct format!
+# ----------------------------------
+REPO_ID = f"{OWNER}/leaderboard"
+QUEUE_REPO = f"{OWNER}/requests"
+RESULTS_REPO = f"{OWNER}/results"
+# If you setup a cache later, just change HF_HOME
+CACHE_PATH = os.getenv("HF_HOME", ".")
+# Local caches
+EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
+EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
+EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
+EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
+API = HfApi(token=TOKEN)
+TAGS = [
+    "Normalizing Flow",
+    "Reference-based (Speaker)",
+    "Prompt-based (Speaker)",
+    "Prosodic Correlates",
+    "Adversarial",
+    "Diffusion",
+    "Audio Tokens",
+    "Autoregressive",
+    "Non-autoregressive",
+    "Pretrained Text Encoder",
+]

new/src/texts.py ADDED Viewed

	@@ -0,0 +1,37 @@

+LLM_BENCHMARKS_TEXT = f"""
+## How it works
+## Reproducibility
+To reproduce our results, check out our repository [here](https://github.com/ttsds/ttsds).
+"""
+EVALUATION_QUEUE_TEXT = """
+## How to submit a TTS model to the leaderboard
+### 1) download the evaluation dataset
+The evaluation dataset consists of wav / text pairs.
+You can download it [here](https://huggingface.co/ttsds/eval).
+The format of the dataset is as follows:
+```
+eval/
+├── 0001.wav
+├── 0001.txt
+├── 0002.wav
+├── 0002.txt
+├── ...
+```
+### 2) create your TTS dataset
+Create a dataset with your TTS model and the evaluation dataset.
+Use the wav files as speaker reference and the text as the prompt.
+Create a .tar.gz file with the dataset, and make sure to include .wav files and .txt files.
+### 3) submit your TTS dataset
+Submit your dataset below.
+"""
+CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
+CITATION_BUTTON_TEXT = r"""
+"""

src/about.py CHANGED Viewed

@@ -1,23 +1,25 @@
 from dataclasses import dataclass
 from enum import Enum
 @dataclass
 class Task:
     benchmark: str
     metric: str
     col_name: str
 # Select your tasks here
 # ---------------------------------------------------
 class Tasks(Enum):
-    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
-    task0 = Task("anli_r1", "acc", "ANLI")
-    task1 = Task("logiqa", "acc_norm", "LogiQA")
-NUM_FEWSHOT = 0 # Change with your few shot
-# ---------------------------------------------------
 # Your leaderboard name
@@ -33,38 +35,24 @@ LLM_BENCHMARKS_TEXT = f"""
 ## How it works
 ## Reproducibility
-To reproduce our results, here is the commands you can run:
 """
 EVALUATION_QUEUE_TEXT = """
-## Some good practices before submitting a model
-### 1) Make sure you can load your model and tokenizer using AutoClasses:
-```python
-from transformers import AutoConfig, AutoModel, AutoTokenizer
-config = AutoConfig.from_pretrained("your model name", revision=revision)
-model = AutoModel.from_pretrained("your model name", revision=revision)
-tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
-```
-If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.
-Note: make sure your model is public!
-Note: if your model needs `use_remote_code=True`, we do not support this option yet but we are working on adding it, stay posted!
-### 2) Convert your model weights to [safetensors](https://huggingface.co/docs/safetensors/index)
-It's a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`!
-### 3) Make sure your model has an open license!
-This is a leaderboard for Open LLMs, and we'd love for as many people as possible to know they can use your model 🤗
-### 4) Fill up your model card
-When we add extra information about models to the leaderboard, it will be automatically taken from the model card
-## In case of model failure
-If your model is displayed in the `FAILED` category, its execution stopped.
-Make sure you have followed the above steps first.
-If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
 """
 CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"

 from dataclasses import dataclass
 from enum import Enum
 @dataclass
 class Task:
     benchmark: str
     metric: str
     col_name: str
+    category: str
 # Select your tasks here
 # ---------------------------------------------------
 class Tasks(Enum):
+    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
+    task0 = Task("anli_r1", "acc", "ANLI", "")
+    task1 = Task("logiqa", "acc_norm", "LogiQA", "")
+NUM_FEWSHOT = 0  # Change with your few shot
+# ---------------------------------------------------
 # Your leaderboard name
 ## How it works
 ## Reproducibility
+To reproduce our results, check out our repository [here](https://github.com/ttsds/ttsds).
 """
 EVALUATION_QUEUE_TEXT = """
+## How to submit a TTS model to the leaderboard
+### 1) download the evaluation dataset
+The evaluation dataset consists of wav / text pairs.
+You can download it [here](https://huggingface.co/ttsds/eval).
+### 2) create your TTS dataset
+Create a dataset with your TTS model and the evaluation dataset.
+Use the wav files as speaker reference and the text as the prompt.
+Create a .tar.gz file with the dataset, and make sure to include .wav files and .txt files.
+### 3) submit your TTS dataset
+Submit your dataset below.
 """
 CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"

src/display/utils.py CHANGED Viewed

@@ -22,32 +22,22 @@ class ColumnContent:
     never_hidden: bool = False
-@dataclass
 class AutoEvalColumn:
-    model_type_symbol = ColumnContent("model_type_symbol", "str", True, never_hidden=True)
     model = ColumnContent("model", "markdown", True, never_hidden=True)
     average = ColumnContent("average", "number", True)
-    anli = ColumnContent("ANLI", "number", True)
-    logiqa = ColumnContent("LogiQA", "number", True)
-    model_type = ColumnContent("model_type", "str", False)
-    architecture = ColumnContent("architecture", "str", False)
-    weight_type = ColumnContent("weight_type", "str", False, True)
-    precision = ColumnContent("precision", "str", False)
-    license = ColumnContent("license", "str", False)
-    params = ColumnContent("#Params (B)", "number", False)
-    likes = ColumnContent("Hub ❤️", "number", False)
-    still_on_hub = ColumnContent("Available on the hub", "bool", False)
-    revision = ColumnContent("Model sha", "str", False, False)
 ## For the queue columns in the submission tab
 @dataclass(frozen=True)
 class EvalQueueColumn:  # Queue column
     model = ColumnContent("model", "markdown", True)
-    revision = ColumnContent("revision", "str", True)
-    private = ColumnContent("private", "bool", True)
-    precision = ColumnContent("precision", "str", True)
-    weight_type = ColumnContent("weight_type", "str", "Original")
     status = ColumnContent("status", "str", True)
@@ -59,64 +49,10 @@ class ModelDetails:
     symbol: str = ""  # emoji
-class ModelType(Enum):
-    PT = ModelDetails(name="pretrained", symbol="🟢")
-    FT = ModelDetails(name="fine-tuned", symbol="🔶")
-    IFT = ModelDetails(name="instruction-tuned", symbol="⭕")
-    RL = ModelDetails(name="RL-tuned", symbol="🟦")
-    Unknown = ModelDetails(name="", symbol="?")
-    def to_str(self, separator=" "):
-        return f"{self.value.symbol}{separator}{self.value.name}"
-    @staticmethod
-    def from_str(type):
-        if "fine-tuned" in type or "🔶" in type:
-            return ModelType.FT
-        if "pretrained" in type or "🟢" in type:
-            return ModelType.PT
-        if "RL-tuned" in type or "🟦" in type:
-            return ModelType.RL
-        if "instruction-tuned" in type or "⭕" in type:
-            return ModelType.IFT
-        return ModelType.Unknown
-class WeightType(Enum):
-    Adapter = ModelDetails("Adapter")
-    Original = ModelDetails("Original")
-    Delta = ModelDetails("Delta")
-class Precision(Enum):
-    float16 = ModelDetails("float16")
-    bfloat16 = ModelDetails("bfloat16")
-    float32 = ModelDetails("float32")
-    # qt_8bit = ModelDetails("8bit")
-    # qt_4bit = ModelDetails("4bit")
-    # qt_GPTQ = ModelDetails("GPTQ")
-    Unknown = ModelDetails("?")
-    def from_str(precision):
-        if precision in ["torch.float16", "float16"]:
-            return Precision.float16
-        if precision in ["torch.bfloat16", "bfloat16"]:
-            return Precision.bfloat16
-        if precision in ["float32"]:
-            return Precision.float32
-        # if precision in ["8bit"]:
-        #    return Precision.qt_8bit
-        # if precision in ["4bit"]:
-        #    return Precision.qt_4bit
-        # if precision in ["GPTQ", "None"]:
-        #    return Precision.qt_GPTQ
-        return Precision.Unknown
 # Column selection
 COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
 EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
 EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
-BENCHMARK_COLS = [t.value.col_name for t in Tasks]

     never_hidden: bool = False
+@dataclass(frozen=True)
 class AutoEvalColumn:
     model = ColumnContent("model", "markdown", True, never_hidden=True)
     average = ColumnContent("average", "number", True)
+    general = ColumnContent("general", "number", True)
+    speaker = ColumnContent("speaker", "number", True)
+    prosody = ColumnContent("prosody", "number", True)
+    intelligibility = ColumnContent("intelligibility", "number", True)
+    environment = ColumnContent("environment", "number", True)
+    tags = ColumnContent("tags", "str", False)
 ## For the queue columns in the submission tab
 @dataclass(frozen=True)
 class EvalQueueColumn:  # Queue column
     model = ColumnContent("model", "markdown", True)
     status = ColumnContent("status", "str", True)
     symbol: str = ""  # emoji
 # Column selection
 COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
 EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
 EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
+BENCHMARK_COLS = ["general", "speaker", "prosody", "intelligibility", "environment"]

src/leaderboard/read_evals.py CHANGED Viewed

@@ -8,28 +8,16 @@ import dateutil
 import numpy as np
 from src.display.formatting import make_clickable_model
-from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType
 @dataclass
 class EvalResult:
     """Represents one full evaluation. Built from a combination of the result and request file for a given run."""
-    eval_name: str  # org_model_precision (uid)
-    full_model: str  # org/model (path on hub)
-    org: str
-    model: str
-    revision: str  # commit hash, "" if main
     results: dict
-    precision: Precision = Precision.Unknown
-    model_type: ModelType = ModelType.Unknown  # Pretrained, fine tuned, ...
-    weight_type: WeightType = WeightType.Original  # Original or Adapter
-    architecture: str = "Unknown"
-    license: str = "?"
-    likes: int = 0
-    num_params: int = 0
     date: str = ""  # submission date of request file
-    still_on_hub: bool = False
     @classmethod
     def init_from_json_file(self, json_filepath):
@@ -39,22 +27,8 @@ class EvalResult:
         config = data.get("config")
-        # Precision
-        precision = Precision.from_str(config.get("model_dtype"))
-        # Get model and org
-        org_and_model = config.get("model_name", config.get("model_args", None))
-        org_and_model = org_and_model.split("/", 1)
-        if len(org_and_model) == 1:
-            org = None
-            model = org_and_model[0]
-            result_key = f"{model}_{precision.value.name}"
-        else:
-            org = org_and_model[0]
-            model = org_and_model[1]
-            result_key = f"{org}_{model}_{precision.value.name}"
-        full_model = "/".join(org_and_model)
         # Extract results available in this file (some results are split in several files)
         results = {}
@@ -70,28 +44,19 @@ class EvalResult:
             results[task.benchmark] = mean_acc
         return self(
-            eval_name=result_key,
-            full_model=full_model,
-            org=org,
-            model=model,
             results=results,
-            precision=precision,
-            revision=config.get("model_sha", ""),
         )
     def update_with_request_file(self, requests_path):
         """Finds the relevant request file for the current model and updates info with it"""
-        request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)
         try:
             with open(request_file, "r") as f:
                 request = json.load(f)
-            self.model_type = ModelType.from_str(request.get("model_type", ""))
-            self.weight_type = WeightType[request.get("weight_type", "Original")]
-            self.license = request.get("license", "?")
-            self.likes = request.get("likes", 0)
-            self.num_params = request.get("params", 0)
-            self.date = request.get("submitted_time", "")
         except Exception:
             print(
                 f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}"
@@ -99,30 +64,11 @@ class EvalResult:
     def to_dict(self):
         """Converts the Eval Result to a dict compatible with our dataframe display"""
-        average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
         data_dict = {
-            "eval_name": self.eval_name,  # not a column, just a save name,
-            AutoEvalColumn.precision.name: self.precision.value.name,
-            AutoEvalColumn.model_type.name: self.model_type.value.name,
-            AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
-            AutoEvalColumn.weight_type.name: self.weight_type.value.name,
-            AutoEvalColumn.architecture.name: self.architecture,
-            AutoEvalColumn.model.name: make_clickable_model(self.full_model),
-            AutoEvalColumn.revision.name: self.revision,
-            AutoEvalColumn.average.name: average,
-            AutoEvalColumn.license.name: self.license,
-            AutoEvalColumn.likes.name: self.likes,
-            AutoEvalColumn.params.name: self.num_params,
-            AutoEvalColumn.still_on_hub.name: self.still_on_hub,
-        }
-        for task in Tasks:
-            data_dict[task.value.col_name] = self.results[task.value.benchmark]
-        return data_dict
-def get_request_file_for_model(requests_path, model_name, precision):
     """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
     request_files = os.path.join(
         requests_path,
@@ -130,13 +76,13 @@ def get_request_file_for_model(requests_path, model_name, precision):
     )
     request_files = glob.glob(request_files)
-    # Select correct request file (precision)
     request_file = ""
     request_files = sorted(request_files, reverse=True)
     for tmp_request_file in request_files:
         with open(tmp_request_file, "r") as f:
             req_content = json.load(f)
-            if req_content["status"] in ["FINISHED"] and req_content["precision"] == precision.split(".")[-1]:
                 request_file = tmp_request_file
     return request_file

 import numpy as np
 from src.display.formatting import make_clickable_model
+from src.display.utils import AutoEvalColumn, Tasks
 @dataclass
 class EvalResult:
     """Represents one full evaluation. Built from a combination of the result and request file for a given run."""
+    model_id: str
     results: dict
     date: str = ""  # submission date of request file
     @classmethod
     def init_from_json_file(self, json_filepath):
         config = data.get("config")
+        # Extract model info
+        model = config.get("model_name", "")
         # Extract results available in this file (some results are split in several files)
         results = {}
             results[task.benchmark] = mean_acc
         return self(
+            model_id=model,
             results=results,
         )
     def update_with_request_file(self, requests_path):
         """Finds the relevant request file for the current model and updates info with it"""
+        request_file = get_request_file_for_model(requests_path, self.full_model)
         try:
             with open(request_file, "r") as f:
                 request = json.load(f)
+            self.model_id = request.get("model", self.model_id)
+            self.results
         except Exception:
             print(
                 f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}"
     def to_dict(self):
         """Converts the Eval Result to a dict compatible with our dataframe display"""
         data_dict = {
+def get_request_file_for_model(requests_path, model_name):
     """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
     request_files = os.path.join(
         requests_path,
     )
     request_files = glob.glob(request_files)
+    # Select correct request file
     request_file = ""
     request_files = sorted(request_files, reverse=True)
     for tmp_request_file in request_files:
         with open(tmp_request_file, "r") as f:
             req_content = json.load(f)
+            if req_content["status"] in ["FINISHED"]:
                 request_file = tmp_request_file
     return request_file

src/submission/submit.py CHANGED Viewed

@@ -1,6 +1,7 @@
 import json
 import os
 from datetime import datetime, timezone
 from src.display.formatting import styled_error, styled_message, styled_warning
 from src.envs import API, EVAL_REQUESTS_PATH, TOKEN, QUEUE_REPO
@@ -13,7 +14,7 @@ USERS_TO_SUBMISSION_DATES = None
 def add_new_eval(
     model: str,
-    revision: str,
 ):
     global REQUESTED_MODELS
     global USERS_TO_SUBMISSION_DATES
@@ -34,7 +35,6 @@ def add_new_eval(
     eval_entry = {
         "model": model,
-        "revision": revision,
         "status": "PENDING",
         "submitted_time": current_time,
         "private": False,
@@ -47,7 +47,7 @@ def add_new_eval(
     print("Creating eval file")
     OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
     os.makedirs(OUT_DIR, exist_ok=True)
-    out_path = f"{OUT_DIR}/{model_name}_eval_request_False_{precision}_{weight_type}.json"
     with open(out_path, "w") as f:
         f.write(json.dumps(eval_entry))

 import json
 import os
 from datetime import datetime, timezone
+from typing import List
 from src.display.formatting import styled_error, styled_message, styled_warning
 from src.envs import API, EVAL_REQUESTS_PATH, TOKEN, QUEUE_REPO
 def add_new_eval(
     model: str,
+    tags: List[str],
 ):
     global REQUESTED_MODELS
     global USERS_TO_SUBMISSION_DATES
     eval_entry = {
         "model": model,
         "status": "PENDING",
         "submitted_time": current_time,
         "private": False,
     print("Creating eval file")
     OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
     os.makedirs(OUT_DIR, exist_ok=True)
+    out_path = f"{OUT_DIR}/{model_name}_eval_request_False.json"
     with open(out_path, "w") as f:
         f.write(json.dumps(eval_entry))