Spaces:

autoevaluate
/

model-evaluator

Runtime error

App Files Files Community

lewtun HF Staff commited on Jun 1, 2022

Commit

ffb1f74

unverified ·

2 Parent(s): dd6f307 faca395

Merge pull request #13 from huggingface/feat/select_metrics

Browse files

Files changed (2) hide show

app.py +76 -2
requirements.txt +1 -0

app.py CHANGED Viewed

@@ -1,12 +1,14 @@
 import os
 import uuid
 from pathlib import Path
 import pandas as pd
 import streamlit as st
-from datasets import get_dataset_config_names
 from dotenv import load_dotenv
 from huggingface_hub import list_datasets
 from evaluation import filter_evaluated_models
 from utils import (
@@ -37,9 +39,61 @@ TASK_TO_ID = {
     "summarization": 8,
 }
 SUPPORTED_TASKS = list(TASK_TO_ID.keys())
 #######
 # APP #
 #######
@@ -256,6 +310,26 @@ with st.expander("Advanced configuration"):
 with st.form(key="form"):
     compatible_models = get_compatible_models(selected_task, selected_dataset)
     selected_models = st.multiselect("Select the models you wish to evaluate", compatible_models)
     print("Selected models:", selected_models)
@@ -291,7 +365,7 @@ with st.form(key="form"):
                         "disk_size_gb": 150,
                     },
                     "evaluation": {
-                        "metrics": [],
                         "models": selected_models,
                     },
                 },

+import inspect
 import os
 import uuid
 from pathlib import Path
 import pandas as pd
 import streamlit as st
+from datasets import get_dataset_config_names, list_metrics, load_metric
 from dotenv import load_dotenv
 from huggingface_hub import list_datasets
+from tqdm import tqdm
 from evaluation import filter_evaluated_models
 from utils import (
     "summarization": 8,
 }
+TASK_TO_DEFAULT_METRICS = {
+    "binary_classification": ["f1", "precision", "recall", "auc", "accuracy"],
+    "multi_class_classification": [
+        "f1_micro",
+        "f1_macro",
+        "f1_weighted",
+        "precision_macro",
+        "precision_micro",
+        "precision_weighted",
+        "recall_macro",
+        "recall_micro",
+        "recall_weighted",
+        "accuracy",
+    ],
+    "entity_extraction": ["precision", "recall", "f1", "accuracy"],
+    "extractive_question_answering": [],
+    "translation": ["sacrebleu", "gen_len"],
+    "summarization": ["rouge1", "rouge2", "rougeL", "rougeLsum", "gen_len"],
+}
 SUPPORTED_TASKS = list(TASK_TO_ID.keys())
+@st.cache
+def get_supported_metrics():
+    metrics = list_metrics()
+    supported_metrics = []
+    for metric in tqdm(metrics):
+        try:
+            metric_func = load_metric(metric)
+        except Exception as e:
+            print(e)
+            print("Skipping the following metric, which cannot load:", metric)
+        argspec = inspect.getfullargspec(metric_func.compute)
+        if "references" in argspec.kwonlyargs and "predictions" in argspec.kwonlyargs:
+            # We require that "references" and "predictions" are arguments
+            # to the metric function. We also require that the other arguments
+            # besides "references" and "predictions" have defaults and so do not
+            # need to be specified explicitly.
+            defaults = True
+            for key, value in argspec.kwonlydefaults.items():
+                if key not in ("references", "predictions"):
+                    if value is None:
+                        defaults = False
+                        break
+            if defaults:
+                supported_metrics.append(metric)
+    return supported_metrics
+supported_metrics = get_supported_metrics()
 #######
 # APP #
 #######
 with st.form(key="form"):
     compatible_models = get_compatible_models(selected_task, selected_dataset)
+    st.markdown("The following metrics will be computed")
+    html_string = " ".join(
+        [
+            '<div style="padding-right:5px;padding-left:5px;padding-top:5px;padding-bottom:5px;float:left">'
+            + '<div style="background-color:#D3D3D3;border-radius:5px;display:inline-block;padding-right:5px;'
+            + 'padding-left:5px;color:white">'
+            + metric
+            + "</div></div>"
+            for metric in TASK_TO_DEFAULT_METRICS[selected_task]
+        ]
+    )
+    st.markdown(html_string, unsafe_allow_html=True)
+    selected_metrics = st.multiselect(
+        "(Optional) Select additional metrics",
+        list(set(supported_metrics) - set(TASK_TO_DEFAULT_METRICS[selected_task])),
+    )
+    st.info(
+        "Note: user-selected metrics will be run with their default arguments from "
+        + "[here](https://github.com/huggingface/datasets/tree/master/metrics)"
+    )
     selected_models = st.multiselect("Select the models you wish to evaluate", compatible_models)
     print("Selected models:", selected_models)
                         "disk_size_gb": 150,
                     },
                     "evaluation": {
+                        "metrics": selected_metrics,
                         "models": selected_models,
                     },
                 },

requirements.txt CHANGED Viewed

@@ -1,4 +1,5 @@
 huggingface-hub==0.4.0
 python-dotenv
 streamlit==1.2.0
 py7zr

 huggingface-hub==0.4.0
 python-dotenv
 streamlit==1.2.0
+datasets
 py7zr