import evaluate
import gradio as gr
import pandas as pd

from fixed_f1 import FixedF1
from fixed_precision import FixedPrecision
from fixed_recall import FixedRecall

title = "'Combine' multiple metrics with this 🤗 Evaluate 🪲 Fix!"
description = """<p style='text-align: center'>
As I introduce myself to the entirety of the 🤗 ecosystem, I've put together this Space to show off a temporary fix for a current 🪲 in the 🤗 Evaluate library. \n
Check out the original, longstanding issue [here](https://github.com/huggingface/evaluate/issues/234). It details how it is currently impossible to \
`evaluate.combine()` multiple metrics related to multilabel text classification. In particular, one cannot `combine` the `f1`, `precision`, and `recall` scores for \
evaluation. I encountered this issue specifically while training [RoBERTa-base-DReiFT](https://huggingface.co/MarioBarbeque/RoBERTa-base-DReiFT) for multilabel \
text classification of 805 labeled medical conditions based on drug reviews. The [following workaround](https://github.com/johngrahamreynolds/FixedMetricsForHF) was
created to address this. \n
This Space shows how one can instantiate these custom `evaluate.Metric`s, each with its own method for averaging across labels, before `combine`-ing them into an \
HF `evaluate.CombinedEvaluations` object. From there, we can easily compute each of the metrics simultaneously using `compute`.</p>
"""
def populate_map(metric_df: pd.DataFrame, metric_set: set) -> dict:
    # Map each requested metric name to the averaging method chosen for it
    # in the "Metric" / "Averaging Type" table.
    metric_map = dict()
    for key in metric_set:
        for val in metric_df.loc[metric_df["Metric"] == key]["Averaging Type"]:
            metric_map[key] = val
    return metric_map
def evaluation(predictions_df: pd.DataFrame, metrics_df: pd.DataFrame) -> str:
metric_set = set(metrics_df["Metric"].to_list())
metric_map = populate_map(metrics_df, metric_set)
combined_list = []
if "f1" in metric_set:
f1 = FixedF1(average=metric_map["f1"])
combined_list.append(f1)
if "precision" in metric_set:
precision = FixedPrecision(average=metric_map["f1"])
combined_list.append(precision)
if "recall" in metric_set:
recall = FixedRecall(average=metric_map["f1"])
combined_list.append(recall)
combined = evaluate.combine(combined_list)
predicted = [int(num) for num in predictions_df["Predicted Class Label"].to_list()]
references = [int(num) for num in predictions_df["Actual Class Label"].to_list()]
combined.add_batch(predictions=predicted, references=references)
outputs = combined.compute()
return f"Your metrics are as follows: \n {outputs}"
space = gr.Interface(
fn=evaluation,
inputs=[
gr.Dataframe(
headers=["Predicted Class Label", "Actual Class Label"],
datatype=["number", "number"],
row_count=5,
col_count=(2, "fixed"),
label="Table of Predicted vs Actual Class Labels"
),
gr.Dataframe(
headers=["Metric", "Averaging Type"],
datatype=["str", "str"],
row_count=(3, "fixed"),
col_count=(2, "fixed"),
label="Table of Metrics and Averaging Method across Labels "
)
],
outputs="text",
title=title,
description=description,
examples=[
[
pd.DataFrame(columns=["Predicted Class Label", "Actual Class Label"], data=[[0,1],[1,1],[2,2],[1,0],[0,0]]),
pd.DataFrame(columns=["Metric", "Averaging Type"], data=[["f1", "weighted"],["precision", "micro"],["recall", "macro"]])
]
],
cache_examples=False
).launch()