import evaluate
import gradio as gr
import pandas as pd

from fixed_f1 import FixedF1
from fixed_precision import FixedPrecision
from fixed_recall import FixedRecall
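
# FixedF1, FixedPrecision, and FixedRecall are the custom `evaluate.Metric`s from the
# FixedMetricsForHF workaround repo linked in the description below; each one is constructed
# with the label-averaging strategy (e.g. "weighted", "micro", "macro") it should use.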
title = "'Combine' multiple metrics with this 🤗 Evaluate 🪲 Fix!"
description = """<p style='text-align: center'>
As I introduce myself to the entirety of the 🤗 ecosystem, I've put together this Space to show off a temporary fix for a current 🪲 in the 🤗 Evaluate library. \n
Check out the original, longstanding issue [here](https://github.com/huggingface/evaluate/issues/234). This details how it is currently impossible to \
`evaluate.combine()` multiple metrics related to multilabel text classification. Particularly, one cannot `combine` the `f1`, `precision`, and `recall` scores for \
evaluation. I encountered this issue specifically while training [RoBERTa-base-DReiFT](https://huggingface.co/MarioBarbeque/RoBERTa-base-DReiFT) for multilabel \
text classification of 805 labeled medical conditions based on drug reviews. The [following workaround](https://github.com/johngrahamreynolds/FixedMetricsForHF) was
created to address this. \n
This Space shows how one can instantiate these custom `evaluate.Metric`s, each with their own unique methodology for averaging across labels, before `combine`-ing them into a
HF `evaluate.CombinedEvaluations` object. From here, we can easily compute each of the metrics simultaneously using `compute`.</p>
"""
article = """<p style='text-align: center'>Check out the [original repo](https://github.com/johngrahamreynolds/FixedMetricsForHF) housing this code, and a quickly \
trained [multilabel text classification model](https://github.com/johngrahamreynolds/RoBERTa-base-DReiFT/tree/main) that makes use of it during evaluation.</p>"""
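
# A minimal sketch of the workaround described above, outside of Gradio. This helper is
# illustrative only and is never called by the app; it simply exercises the same Fixed* metrics
# and `average` keyword used in `evaluation` below.
def _combined_metrics_sketch() -> dict:
    f1 = FixedF1(average="weighted")
    recall = FixedRecall(average="macro")
    combined = evaluate.combine([f1, recall])
    combined.add_batch(predictions=[0, 1, 2, 1], references=[0, 1, 1, 1])
    return combined.compute()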
def evaluation(predictions, metrics) -> str:
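    """Build the requested fixed metrics, `combine` them, and compute them over the submitted labels.

    `predictions` is a DataFrame with "Predicted Class Label" and "Actual Class Label" columns;
    `metrics` is a DataFrame pairing each requested metric ("f1", "precision", "recall") with the
    averaging type it should use. Returns a string reporting every computed score.
    """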
    metric_set = set(metrics["Metric"].to_list())
    combined_list = []

    if "f1" in metric_set:
        # Use .iloc[0] so the lookup works regardless of which row the metric was entered in
        f1 = FixedF1(average=metrics.loc[metrics["Metric"] == "f1", "Averaging Type"].iloc[0])
        combined_list.append(f1)
    if "precision" in metric_set:
        precision = FixedPrecision(average=metrics.loc[metrics["Metric"] == "precision", "Averaging Type"].iloc[0])
        combined_list.append(precision)
    if "recall" in metric_set:
        recall = FixedRecall(average=metrics.loc[metrics["Metric"] == "recall", "Averaging Type"].iloc[0])
        combined_list.append(recall)

    if not combined_list:
        return "Please choose at least one metric ('f1', 'precision', or 'recall') to evaluate."

    combined = evaluate.combine(combined_list)

    predicted = [int(num) for num in predictions["Predicted Class Label"].to_list()]
    references = [int(num) for num in predictions["Actual Class Label"].to_list()]

    combined.add_batch(predictions=predicted, references=references)
    outputs = combined.compute()

    return f"Your metrics are as follows: \n {outputs}"
# gr.Interface(
# fn=show_off,
# inputs=gr.Dataframe(type="array", datatype="number", row_count=5, col_count=1),
# outputs="text",
# title=title,
# description=description,
# article=article,
# examples=[pd.DataFrame([1, 0, 2, 0, 1])],
# cache_examples=False
# ).launch()
# use this to create examples
# data = {'Name':['Tony', 'Steve', 'Bruce', 'Peter' ],
# 'Age': [35, 70, 45, 20] }
# # Creating DataFrame
# df = pd.DataFrame(data)
# Unused helper left over from earlier experimentation with the Gradio Dataframe component;
# it is not referenced anywhere else in this app.
def filter_records(records, gender):
    return records[records["gender"] == gender]
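
# The Space takes two tables as input: predicted vs. actual class labels, and the metric /
# averaging-type pairs to evaluate; it reports every requested score in a single text output.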
space = gr.Interface(
    fn=evaluation,
    inputs=[
        gr.Dataframe(
            headers=["Predicted Class Label", "Actual Class Label"],
            datatype=["number", "number"],
            row_count=5,
            col_count=(2, "fixed"),
            label="Table of Predicted vs Actual Class Labels"
        ),
        gr.Dataframe(
            headers=["Metric", "Averaging Type"],
            datatype=["str", "str"],
            row_count=(3, "fixed"),
            col_count=(2, "fixed"),
            label="Table of Metrics and Averaging Method across Labels"
        )
    ],
    outputs="text",
    title=title,
    description=description,
    article=article,
    examples=[
        [
            pd.DataFrame(columns=["Predicted Class Label", "Actual Class Label"], data=[[0, 1], [1, 1], [2, 2], [1, 0], [0, 0]]),
            pd.DataFrame(columns=["Metric", "Averaging Type"], data=[["f1", "weighted"], ["precision", "micro"], ["recall", "macro"]])
        ]
    ],
    cache_examples=False
).launch()