Commit 888432c · Tristan Thrush committed · 1 parent: 79668b2
added enforcement for known metric ranges

Files changed:
- app.py (+13, -2)
- ascending_metrics.py (+0, -10)
- utils.py (+38, -0)
app.py
CHANGED
@@ -2,7 +2,7 @@ import pandas as pd
 import streamlit as st
 from huggingface_hub import HfApi, hf_hub_download
 from huggingface_hub.repocard import metadata_load
-from ascending_metrics import ascending_metrics
+from utils import ascending_metrics, metric_ranges
 import numpy as np
 from st_aggrid import AgGrid, GridOptionsBuilder, JsCode
 from os.path import exists
@@ -46,6 +46,7 @@ def parse_metrics_rows(meta, only_verified=False):
         if "config" in result["dataset"]:
             row["config"] = result["dataset"]["config"]
         no_results = True
+        incorrect_results = False
         for metric in result["metrics"]:
             name = metric["type"].lower().strip()
 
@@ -64,10 +65,16 @@ def parse_metrics_rows(meta, only_verified=False):
             if "verified" in metric and metric["verified"]:
                 no_results = False
                 row[name] = value
+                if name in metric_ranges:
+                    if value < metric_ranges[name][0] or value > metric_ranges[name][1]:
+                        incorrect_results = True
             else:
                 no_results = False
                 row[name] = value
+                if name in metric_ranges:
+                    if value < metric_ranges[name][0] or value > metric_ranges[name][1]:
+                        incorrect_results = True
-        if no_results:
+        if no_results or incorrect_results:
             continue
         yield row
 
@@ -199,6 +206,10 @@ if len(dataset_df) > 0:
         "Want to beat the leaderboard? Don't see your model here? Simply request an automatic evaluation [here](https://huggingface.co/spaces/autoevaluate/model-evaluator)."
     )
 
+    st.markdown(
+        "Note: if you do not see your self-reported results here, ensure that your results are in the expected range for all metrics. E.g., accuracy is 0-1, not 0-100."
+    )
+
     # Make the default metric appear right after model names
     cols = dataset_df.columns.tolist()
     cols.remove(sorting_metric)
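To make the new check easier to follow, here is a minimal sketch (not part of the commit) of the range enforcement that parse_metrics_rows now applies to each self-reported metric. The helper name is_in_known_range is hypothetical; the comparison mirrors the added lines and uses the metric_ranges table introduced in utils.py below.

from utils import metric_ranges

def is_in_known_range(name, value):
    # Hypothetical helper: a value is rejected only if the metric has a
    # known range and the reported value falls outside that range.
    if name not in metric_ranges:
        return True
    low, high = metric_ranges[name]
    return low <= value <= high

# A model card that reports accuracy on a 0-100 scale is filtered out,
# because the whole result row is skipped when any value is out of range.
print(is_in_known_range("accuracy", 0.87))  # True  -> row kept
print(is_in_known_range("accuracy", 87.0))  # False -> row skipped
print(is_in_known_range("bleu", 32.5))      # True  -> no known range for "bleu", never rejected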
ascending_metrics.py
DELETED
@@ -1,10 +0,0 @@
-ascending_metrics = {
-    "wer",
-    "cer",
-    "loss",
-    "mae",
-    "mahalanobis",
-    "mse",
-    "perplexity",
-    "ter",
-}
utils.py
ADDED
@@ -0,0 +1,38 @@
+ascending_metrics = {
+    "wer",
+    "cer",
+    "loss",
+    "mae",
+    "mahalanobis",
+    "mse",
+    "perplexity",
+    "ter",
+}
+
+metric_ranges = {
+    "accuracy": (0,1),
+    "precision": (0,1),
+    "recall": (0,1),
+    "f1": (0,1),
+    "macro f1": (0,1),
+    "micro f1": (0,1),
+    "cer": (0,1),
+    "wer": (0,1),
+    "pearson": (-1, 1),
+    "matthews_correlation": (-1, 1),
+    "spearmanr": (-1, 1),
+    "google_bleu": (0, 1),
+    "precision@10": (0, 1),
+    "mae": (0, 1),
+    "mauve": (0, 1),
+    "frontier_integral": (0, 1),
+    "mean_iou": (0, 1),
+    "mean_accuracy": (0, 1),
+    "overall_accuracy": (0, 1),
+    "meteor": (0, 1),
+    "mse": (0, 1),
+    "perplexity": (0, float("inf")),
+    "rogue1": (0, 1),
+    "rogue2": (0, 1),
+    "sari": (0, 100),
+}
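For context, a short usage sketch (assumptions, not code from this commit): ascending_metrics lists error-style metrics that the leaderboard sorts in ascending order (lower is better), while metric_ranges maps each known metric to its (low, high) bounds, including an unbounded upper limit for perplexity. The helper name below is hypothetical.

from utils import ascending_metrics, metric_ranges

def sort_direction(metric_name):
    # Hypothetical helper: error-style metrics (wer, loss, mse, ...) improve
    # as they decrease, so they sort ascending; everything else descending.
    return "ascending" if metric_name in ascending_metrics else "descending"

print(sort_direction("wer"))        # ascending: lower word error rate is better
print(sort_direction("accuracy"))   # descending: higher accuracy is better

low, high = metric_ranges["perplexity"]
print(low, high)                    # 0 inf -- perplexity has no upper bound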