import evaluate
import gradio as gr
import pandas as pd

from fixed_f1 import FixedF1
from fixed_precision import FixedPrecision
from fixed_recall import FixedRecall

title = "'Combine' multiple metrics with this πŸ€— Evaluate πŸͺ² Fix!"
description = """<p style='text-align: center'>
As I introduce myself to the entirety of the πŸ€— ecosystem, I've put together this Space to show off a temporary fix for a current πŸͺ² in the πŸ€— Evaluate library. \n
Check out the original, longstanding issue [here](https://github.com/huggingface/evaluate/issues/234). This details how it is currently impossible to \
'evaluate.combine()' multiple metrics related to multilabel text classification. Particularly, one cannot 'combine()' the f1, precision, and recall scores for \
evaluation. I encountered this issue specifically while training [RoBERTa-base-DReiFT](https://huggingface.co/MarioBarbeque/RoBERTa-base-DReiFT) for multilabel \
text classification of 805 labeled medical conditions based on drug reviews. \n
This Space shows how one can instantiate these custom metrics each with their own unique methodology for averaging across labels, combine them into a single
HF `evaluate.EvaluationModule` (or `Metric`), and compute them.</p>
"""
article = "<p style='text-align: center'>Check out the [original repo](https://github.com/johngrahamreynolds/FixedMetricsForHF) housing this code, and a quickly \
trained [multilabel text classification model](https://github.com/johngrahamreynolds/RoBERTa-base-DReiFT/tree/main) that makes use of it during evaluation.</p>"

def evaluation(predictions, metrics) -> str:
    """Combine the fixed f1, precision, and recall metrics and compute them over the given predictions."""

    # map each metric name to its chosen averaging scheme, e.g. {"f1": "weighted", ...}
    averaging = dict(zip(metrics["Metric"], metrics["Averaging Type"]))

    f1 = FixedF1(average=averaging["f1"])
    precision = FixedPrecision(average=averaging["precision"])
    recall = FixedRecall(average=averaging["recall"])

    # the fix in action: the subclassed metrics can be combined into a single EvaluationModule
    combined = evaluate.combine([f1, recall, precision])

    # gr.Dataframe inputs are passed to the function as pandas DataFrames
    predicted = predictions["Predicted Label"].to_list()
    references = predictions["Actual Label"].to_list()

    combined.add_batch(predictions=predicted, references=references)
    outputs = combined.compute()

    return "Your metrics are as follows: \n" + str(outputs)
# gr.Interface(
# fn=show_off,
# inputs=gr.Dataframe(type="array", datatype="number", row_count=5, col_count=1),
# outputs="text",
# title=title,
# description=description,
# article=article,
# examples=[pd.DataFrame([1, 0, 2, 0, 1])],
# cache_examples=False
# ).launch()
# use this to create examples
# data = {'Name':['Tony', 'Steve', 'Bruce', 'Peter' ],
# 'Age': [35, 70, 45, 20] }
# # Creating DataFrame
# df = pd.DataFrame(data)

# first input: a table of predicted vs. actual labels; second input: the averaging scheme
# to use for each of the f1, precision, and recall metrics
space = gr.Interface(
    fn=evaluation,
    inputs=[
        gr.Dataframe(
            headers=["Predicted Label", "Actual Label"],
            datatype=["number", "number"],
            row_count=5,
            col_count=(2, "fixed"),
        ),
        gr.Dataframe(
            headers=["Metric", "Averaging Type"],
            datatype=["str", "str"],
            row_count=3,
            col_count=(2, "fixed"),
        ),
    ],
    outputs="textbox",
    title=title,
    description=description,
    article=article,
    cache_examples=False,
)

space.launch()