Commit 888432c · Tristan Thrush committed · 1 parent: 79668b2
added enforcement for known metric ranges

Files changed:
- app.py (+13, -2)
- ascending_metrics.py (+0, -10)
- utils.py (+38, -0)
app.py
CHANGED
@@ -2,7 +2,7 @@ import pandas as pd
 import streamlit as st
 from huggingface_hub import HfApi, hf_hub_download
 from huggingface_hub.repocard import metadata_load
-from ascending_metrics import ascending_metrics
+from utils import ascending_metrics, metric_ranges
 import numpy as np
 from st_aggrid import AgGrid, GridOptionsBuilder, JsCode
 from os.path import exists
@@ -46,6 +46,7 @@ def parse_metrics_rows(meta, only_verified=False):
         if "config" in result["dataset"]:
             row["config"] = result["dataset"]["config"]
         no_results = True
+        incorrect_results = False
         for metric in result["metrics"]:
             name = metric["type"].lower().strip()
 
@@ -64,10 +65,16 @@ def parse_metrics_rows(meta, only_verified=False):
             if "verified" in metric and metric["verified"]:
                 no_results = False
                 row[name] = value
+                if name in metric_ranges:
+                    if value < metric_ranges[name][0] or value > metric_ranges[name][1]:
+                        incorrect_results = True
             else:
                 no_results = False
                 row[name] = value
+                if name in metric_ranges:
+                    if value < metric_ranges[name][0] or value > metric_ranges[name][1]:
+                        incorrect_results = True
-        if no_results:
+        if no_results or incorrect_results:
             continue
         yield row
 
@@ -199,6 +206,10 @@ if len(dataset_df) > 0:
         "Want to beat the leaderboard? Don't see your model here? Simply request an automatic evaluation [here](https://huggingface.co/spaces/autoevaluate/model-evaluator)."
     )
 
+    st.markdown(
+        "Note: if you do not see your self-reported results here, ensure that your results are in the expected range for all metrics. E.g., accuracy is 0-1, not 0-100."
+    )
+
     # Make the default metric appear right after model names
     cols = dataset_df.columns.tolist()
     cols.remove(sorting_metric)
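To make the new check easier to follow, here is a minimal sketch (not part of the commit) of the range enforcement that parse_metrics_rows now applies to each self-reported metric. The helper name is_in_known_range is hypothetical; the comparison mirrors the added lines and uses the metric_ranges table introduced in utils.py below.

from utils import metric_ranges

def is_in_known_range(name, value):
    # Hypothetical helper: a value is rejected only if the metric has a
    # known range and the reported value falls outside that range.
    if name not in metric_ranges:
        return True
    low, high = metric_ranges[name]
    return low <= value <= high

# A model card that reports accuracy on a 0-100 scale is filtered out,
# because the whole result row is skipped when any value is out of range.
print(is_in_known_range("accuracy", 0.87))  # True  -> row kept
print(is_in_known_range("accuracy", 87.0))  # False -> row skipped
print(is_in_known_range("bleu", 32.5))      # True  -> no known range for "bleu", never rejected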
ascending_metrics.py
DELETED
@@ -1,10 +0,0 @@
-ascending_metrics = {
-    "wer",
-    "cer",
-    "loss",
-    "mae",
-    "mahalanobis",
-    "mse",
-    "perplexity",
-    "ter",
-}
utils.py
ADDED
@@ -0,0 +1,38 @@
+ascending_metrics = {
+    "wer",
+    "cer",
+    "loss",
+    "mae",
+    "mahalanobis",
+    "mse",
+    "perplexity",
+    "ter",
+}
+
+metric_ranges = {
+    "accuracy": (0,1),
+    "precision": (0,1),
+    "recall": (0,1),
+    "f1": (0,1),
+    "macro f1": (0,1),
+    "micro f1": (0,1),
+    "cer": (0,1),
+    "wer": (0,1),
+    "pearson": (-1, 1),
+    "matthews_correlation": (-1, 1),
+    "spearmanr": (-1, 1),
+    "google_bleu": (0, 1),
+    "precision@10": (0, 1),
+    "mae": (0, 1),
+    "mauve": (0, 1),
+    "frontier_integral": (0, 1),
+    "mean_iou": (0, 1),
+    "mean_accuracy": (0, 1),
+    "overall_accuracy": (0, 1),
+    "meteor": (0, 1),
+    "mse": (0, 1),
+    "perplexity": (0, float("inf")),
+    "rogue1": (0, 1),
+    "rogue2": (0, 1),
+    "sari": (0, 100),
+}
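For context, a short usage sketch (assumptions, not code from this commit): ascending_metrics lists error-style metrics that the leaderboard sorts in ascending order (lower is better), while metric_ranges maps each known metric to its (low, high) bounds, including an unbounded upper limit for perplexity. The helper name below is hypothetical.

from utils import ascending_metrics, metric_ranges

def sort_direction(metric_name):
    # Hypothetical helper: error-style metrics (wer, loss, mse, ...) improve
    # as they decrease, so they sort ascending; everything else descending.
    return "ascending" if metric_name in ascending_metrics else "descending"

print(sort_direction("wer"))        # ascending: lower word error rate is better
print(sort_direction("accuracy"))   # descending: higher accuracy is better

low, high = metric_ranges["perplexity"]
print(low, high)                    # 0 inf -- perplexity has no upper bound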