|
import evaluate
import gradio as gr
import pandas as pd

from fixed_f1 import FixedF1
from fixed_precision import FixedPrecision
from fixed_recall import FixedRecall
|
title = "'Combine' multiple metrics with this 🤗 Evaluate 🪲 Fix!"
|
description = """<p style='text-align: center'>
As I introduce myself to the entirety of the 🤗 ecosystem, I've put together this Space to show off a temporary fix for a current 🪲 in the 🤗 Evaluate library. \n

Check out the original, longstanding issue [here](https://github.com/huggingface/evaluate/issues/234). It details how it is currently impossible to \
`evaluate.combine()` multiple metrics related to multilabel text classification. In particular, one cannot `combine()` the f1, precision, and recall scores for \
evaluation. I encountered this issue while training [RoBERTa-base-DReiFT](https://huggingface.co/MarioBarbeque/RoBERTa-base-DReiFT) for multilabel \
text classification of 805 labeled medical conditions based on drug reviews. \n

This Space shows how one can instantiate these custom metrics, each with its own averaging method across labels, combine them into a single \
HF `evaluate.EvaluationModule` (or `Metric`), and compute them.</p>
"""
|
article = "<p style='text-align: center'>Check out the [original repo](https://github.com/johngrahamreynolds/FixedMetricsForHF) housing this code, and a quickly \
trained [multilabel text classification model](https://github.com/johngrahamreynolds/RoBERTa-base-DReiFT/tree/main) that makes use of it during evaluation.</p>"
|
|
def evaluation(predictions: pd.DataFrame, metrics: pd.DataFrame) -> str:
    # The second dataframe maps each metric name (e.g. "f1") to its averaging strategy
    averaging = {
        str(name).strip().lower(): avg
        for name, avg in zip(metrics["Metric"], metrics["Averaging Type"])
    }

    f1 = FixedF1(average=averaging.get("f1"))
    precision = FixedPrecision(average=averaging.get("precision"))
    recall = FixedRecall(average=averaging.get("recall"))
    combined = evaluate.combine([f1, recall, precision])

    # gr.Dataframe hands the component's value to the function as a pandas DataFrame
    predicted = predictions["Predicted Label"].astype(int).to_list()
    references = predictions["Actual Label"].astype(int).to_list()

    combined.add_batch(predictions=predicted, references=references)
    outputs = combined.compute()

    # compute() returns a dict of metric names to scores; format it for the textbox output
    return "Your metrics are as follows: \n" + "\n".join(f"{metric}: {score}" for metric, score in outputs.items())
|
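# Illustrative sketch of what a user might type into the two dataframes below (assumed values,
# not outputs from any model):
#   Predicted Label / Actual Label rows:  (1, 1), (0, 2), (2, 2), (1, 0), (0, 0)
#   Metric / Averaging Type rows:         ("f1", "weighted"), ("precision", "micro"), ("recall", "macro")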
space = gr.Interface(
    fn=evaluation,
    inputs=[
        gr.Dataframe(
            headers=["Predicted Label", "Actual Label"],
            datatype=["number", "number"],
            row_count=5,
            col_count=(2, "fixed"),
        ),
        gr.Dataframe(
            headers=["Metric", "Averaging Type"],
            datatype=["str", "str"],
            row_count=3,
            col_count=(2, "fixed"),
        ),
    ],
    outputs="textbox",
    title=title,
    description=description,
    article=article,
    cache_examples=False,
).launch()