from fixed_f1 import FixedF1
from fixed_precision import FixedPrecision
from fixed_recall import FixedRecall
import evaluate
import gradio as gr
import pandas as pd
title = "'Combine' multiple metrics with this 🤗 Evaluate 🪲 Fix!"
description = """<p style='text-align: center'>
As I introduce myself to the entirety of the 🤗 ecosystem, I've put together this Space to show off a temporary fix for a current 🪲 in the 🤗 Evaluate library. \n
Check out the original, longstanding issue [here](https://github.com/huggingface/evaluate/issues/234), which details how it is currently impossible to \
`evaluate.combine()` multiple metrics related to multilabel text classification. In particular, one cannot `combine()` the F1, precision, and recall scores for \
evaluation. I encountered this issue while training [RoBERTa-base-DReiFT](https://huggingface.co/MarioBarbeque/RoBERTa-base-DReiFT) for multilabel \
text classification of 805 labeled medical conditions based on drug reviews. \n
This Space shows how one can instantiate these custom metrics, each with its own strategy for averaging across labels, combine them into a single \
HF `evaluate.EvaluationModule` (or `Metric`), and compute them.</p>
"""
article = "<p style='text-align: center'>Check out the [original repo](https://github.com/johngrahamreynolds/FixedMetricsForHF) housing this code, and a quickly \
trained [multilabel text classification model](https://github.com/johngrahamreynolds/RoBERTa-base-DReiFT/tree/main) that makes use of it during evaluation.</p>"
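
# A minimal sketch of the workflow the description refers to: instantiate each Fixed*
# wrapper with its own `average` strategy, bundle them with `evaluate.combine()`, and
# compute them in one call. The toy labels and averaging choices below are illustrative
# only; uncomment to try it locally.
#
# combined_demo = evaluate.combine([
#     FixedF1(average="macro"),
#     FixedPrecision(average="micro"),
#     FixedRecall(average="weighted"),
# ])
# combined_demo.compute(predictions=[0, 2, 1, 1, 0], references=[0, 1, 1, 2, 0])
# # -> {'f1': ..., 'precision': ..., 'recall': ...}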
def evaluation(predictions, metrics) -> str:
    # With Gradio's default type="pandas", both inputs arrive as pandas DataFrames.
    # Map each metric name (expected rows: "f1", "precision", "recall") to its averaging type.
    averaging = dict(zip(metrics["Metric"], metrics["Averaging Type"]))

    f1 = FixedF1(average=averaging["f1"])
    precision = FixedPrecision(average=averaging["precision"])
    recall = FixedRecall(average=averaging["recall"])

    combined = evaluate.combine([f1, recall, precision])

    predicted = predictions["Predicted Label"].to_list()
    references = predictions["Actual Label"].to_list()

    combined.add_batch(predictions=predicted, references=references)
    outputs = combined.compute()

    return "Your metrics are as follows: \n" + str(outputs)
# gr.Interface(
# fn=show_off,
# inputs=gr.Dataframe(type="array", datatype="number", row_count=5, col_count=1),
# outputs="text",
# title=title,
# description=description,
# article=article,
# examples=[pd.DataFrame([1, 0, 2, 0, 1])],
# cache_examples=False
# ).launch()
# use this to create examples
# data = {'Name':['Tony', 'Steve', 'Bruce', 'Peter' ],
# 'Age': [35, 70, 45, 20] }
# # Creating DataFrame
# df = pd.DataFrame(data)
def filter_records(records, gender):
    return records[records["gender"] == gender]
space = gr.Interface(
    fn=evaluation,
    inputs=[
        gr.Dataframe(
            headers=["Predicted Label", "Actual Label"],
            datatype=["number", "number"],
            row_count=5,
            col_count=(2, "fixed"),
        ),
        gr.Dataframe(
            headers=["Metric", "Averaging Type"],
            datatype=["str", "str"],
            row_count=3,
            col_count=(2, "fixed"),
        ),
    ],
    outputs="textbox",
    title=title,
    description=description,
    article=article,
    cache_examples=False,
).launch()