from fixed_f1 import FixedF1
from fixed_precision import FixedPrecision
from fixed_recall import FixedRecall
import evaluate
import gradio as gr
import pandas as pd

title = "'Combine' multiple metrics with this 🤗 Evaluate 🪲 Fix!"

description = """<p style='text-align: center'>
As I introduce myself to the entirety of the 🤗 ecosystem, I've put together this Space to show off a temporary fix for a current 🪲 in the 🤗 Evaluate library. \n

Check out the original, longstanding issue [here](https://github.com/huggingface/evaluate/issues/234). It details how it is currently impossible to \
`evaluate.combine()` multiple metrics related to multilabel text classification. In particular, one cannot `combine()` the f1, precision, and recall metrics for \
evaluation. I ran into this issue while training [RoBERTa-base-DReiFT](https://huggingface.co/MarioBarbeque/RoBERTa-base-DReiFT) for multilabel \
text classification of 805 labeled medical conditions based on drug reviews. \n

This Space shows how one can instantiate these custom metrics, each with its own averaging strategy across labels, combine them into a single \
HF `evaluate.EvaluationModule` (or `Metric`), and compute them together.</p>
"""

article = "<p style='text-align: center'>Check out the [original repo](https://github.com/johngrahamreynolds/FixedMetricsForHF) housing this code, and a quickly \
trained [multilabel text classification model](https://github.com/johngrahamreynolds/RoBERTa-base-DReiFT/tree/main) that makes use of it during evaluation.</p>"

def evaluation(predictions, metrics) -> str:

    # Map each metric name to the averaging strategy entered in the second Dataframe
    averaging = dict(zip(metrics["Metric"], metrics["Averaging Type"]))

    f1 = FixedF1(average=averaging["f1"])
    precision = FixedPrecision(average=averaging["precision"])
    recall = FixedRecall(average=averaging["recall"])
    combined = evaluate.combine([f1, recall, precision])

    # gr.Dataframe passes its value to the function as a pandas DataFrame by default
    predicted = predictions["Predicted Label"].to_list()
    references = predictions["Actual Label"].to_list()

    combined.add_batch(predictions=predicted, references=references)
    outputs = combined.compute()

    return "Your metrics are as follows: \n" + str(outputs)


# gr.Interface(
#     fn=show_off,
#     inputs=gr.Dataframe(type="array", datatype="number", row_count=5, col_count=1),
#     outputs="text",
#     title=title,
#     description=description,
#     article=article,
#     examples=[pd.DataFrame([1, 0, 2, 0, 1])],
#     cache_examples=False
# ).launch()

# use this to create examples

# data = {'Name':['Tony', 'Steve', 'Bruce', 'Peter' ],
#         'Age': [35, 70, 45, 20] }  

# # Creating DataFrame  
# df = pd.DataFrame(data) 
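
# A hedged sketch of example inputs matching the two Dataframes defined below; the
# label values and averaging choices are illustrative assumptions. If desired, these
# frames could be passed to gr.Interface through its `examples=` parameter.
example_predictions = pd.DataFrame(
    {"Predicted Label": [1, 0, 2, 0, 1], "Actual Label": [1, 0, 1, 0, 1]}
)
example_metrics = pd.DataFrame(
    {"Metric": ["f1", "precision", "recall"],
     "Averaging Type": ["weighted", "micro", "macro"]}
)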



# Unused helper; not referenced by the interface below
def filter_records(records, gender):
    return records[records["gender"] == gender]

space = gr.Interface(
    fn=evaluation,
    inputs=[
        gr.Dataframe(
            headers=["Predicted Label", "Actual Label"],
            datatype=["number", "number"],
            row_count=5,
            col_count=(2, "fixed"),
        ),
        gr.Dataframe(
            headers=["Metric", "Averaging Type"],
            datatype=["str", "str"],
            row_count=3,
            col_count=(2, "fixed"),
        )
    ],
    outputs="textbox",
    title=title,
    description=description,
    article=article,
    cache_examples=False
).launch()