from fixed_f1 import FixedF1
from fixed_precision import FixedPrecision
from fixed_recall import FixedRecall
import evaluate
import gradio as gr
import pandas as pd

title = "'Combine' multiple metrics with this 🤗 Evaluate 🪲 Fix!"

description = """<p style='text-align: center'>
As I introduce myself to the entirety of the 🤗 ecosystem, I've put together this Space to show off a temporary fix for a current 🪲 in the 🤗 Evaluate library. \n

Check out the original, longstanding issue [here](https://github.com/huggingface/evaluate/issues/234). It details how it is currently impossible to \
`evaluate.combine()` multiple metrics related to multilabel text classification. In particular, one cannot `combine` the `f1`, `precision`, and `recall` scores for \
evaluation. I encountered this issue while training [RoBERTa-base-DReiFT](https://huggingface.co/MarioBarbeque/RoBERTa-base-DReiFT) for multilabel \
text classification of 805 labeled medical conditions based on drug reviews. The [following workaround](https://github.com/johngrahamreynolds/FixedMetricsForHF) was \
created to address this. \n

This Space shows how one can instantiate these custom `evaluate.Metric`s, each with its own method for averaging across labels, before `combine`-ing them into an \
HF `evaluate.CombinedEvaluations` object. From there, all of the metrics can be computed simultaneously with a single call to `compute`.</p>
"""

article = """<p style='text-align: center'>Check out the [original repo](https://github.com/johngrahamreynolds/FixedMetricsForHF) housing this code, and a quickly \
trained [multilabel text classification model](https://github.com/johngrahamreynolds/RoBERTa-base-DReiFT/tree/main) that makes use of it during evaluation.</p>"""
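
# A minimal sketch of the workaround described above, assuming the FixedF1, FixedPrecision,
# and FixedRecall classes imported at the top of this file behave like their Hub counterparts:
# instantiate each metric with its own averaging strategy, `combine` them, then `compute`
# everything in one call. The returned dict should hold all three scores at once. This helper
# is illustrative only and is never invoked by the Space itself.
def _combined_metrics_sketch() -> dict:
    f1 = FixedF1(average="weighted")
    precision = FixedPrecision(average="micro")
    recall = FixedRecall(average="macro")
    combined = evaluate.combine([f1, precision, recall])
    return combined.compute(predictions=[0, 1, 2, 1, 0], references=[1, 1, 2, 0, 0])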

def evaluation(predictions, metrics) -> str:
    """Combine the requested fixed metrics and compute them over the submitted label pairs."""

    metric_set = set(metrics["Metric"].to_list())
    combined_list = []

    # Instantiate each requested metric with the averaging strategy chosen in the metrics
    # table; positional .iloc[0] is used so the lookup does not depend on the row's index label.
    if "f1" in metric_set:
        f1 = FixedF1(average=metrics.loc[metrics["Metric"] == "f1", "Averaging Type"].iloc[0])
        combined_list.append(f1)
    if "precision" in metric_set:
        precision = FixedPrecision(average=metrics.loc[metrics["Metric"] == "precision", "Averaging Type"].iloc[0])
        combined_list.append(precision)
    if "recall" in metric_set:
        recall = FixedRecall(average=metrics.loc[metrics["Metric"] == "recall", "Averaging Type"].iloc[0])
        combined_list.append(recall)
    
    combined = evaluate.combine(combined_list)

    predicted = [int(num) for num in predictions["Predicted Class Label"].to_list()]
    references = [int(num) for num in predictions["Actual Class Label"].to_list()]

    combined.add_batch(predictions=predicted, references=references)
    outputs = combined.compute()

    return f"Your metrics are as follows: \n {outputs}"


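# Hypothetical local sanity check (not executed by the Space): run evaluation() directly on
# the same sample tables wired into the Interface `examples` below. Uncomment to try it.
# preds_df = pd.DataFrame(
#     columns=["Predicted Class Label", "Actual Class Label"],
#     data=[[0, 1], [1, 1], [2, 2], [1, 0], [0, 0]],
# )
# metrics_df = pd.DataFrame(
#     columns=["Metric", "Averaging Type"],
#     data=[["f1", "weighted"], ["precision", "micro"], ["recall", "macro"]],
# )
# print(evaluation(preds_df, metrics_df))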

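# Two-table Gradio UI: the first Dataframe collects predicted vs. actual class labels, the
# second selects which metrics to combine and how each should average across labels.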
space = gr.Interface(
    fn=evaluation,
    inputs=[
        gr.Dataframe(
            headers=["Predicted Class Label", "Actual Class Label"],
            datatype=["number", "number"],
            row_count=5,
            col_count=(2, "fixed"),
            label="Table of Predicted vs Actual Class Labels"
        ),
        gr.Dataframe(
            headers=["Metric", "Averaging Type"],
            datatype=["str", "str"],
            row_count=(3, "fixed"),
            col_count=(2, "fixed"),
            label="Table of Metrics and Averaging Method across Labels "
        )
    ],
    outputs="text",
    title=title,
    description=description,
    article=article,
    examples=[
        [
            pd.DataFrame(columns=["Predicted Class Label", "Actual Class Label"], data=[[0,1],[1,1],[2,2],[1,0],[0,0]]),
            pd.DataFrame(columns=["Metric", "Averaging Type"], data=[["f1", "weighted"],["precision", "micro"],["recall", "macro"]])
        ]
    ],
    cache_examples=False
).launch()