import sys
import gradio as gr
import pandas as pd
import evaluate
from evaluate.utils import infer_gradio_input_types, json_to_string_type, parse_readme, parse_test_cases
# from evaluate.utils import launch_gradio_widget  # using this directly is erroneous - let's fix this
from fixed_f1 import FixedF1
from pathlib import Path
added_description = """
See the 🤗 Space showing off how to combine various metrics:
[MarioBarbeque/CombinedEvaluationMetrics🪲](https://huggingface.co/spaces/MarioBarbeque/CombinedEvaluationMetrics). This fix thereby circumvents the
original, longstanding issue found [here](https://github.com/huggingface/evaluate/issues/234). We look forward to fixing this in a PR soon.
In the specific use case of the `FixedF1` metric, one writes the following:\n
```python
f1 = FixedF1(average=...)
f1.add_batch(predictions=..., references=...)
f1.compute()
```\n
where the `average` parameter configures how F1 scores are averaged across labels. Acceptable values include `[None, 'micro', 'macro', 'weighted']`
(or `binary` when there are only two labels). \n
Play around with the interface below to see how the F1 score changes based on predictions, references, and method of averaging!
"""
metric = FixedF1()
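# Derive the input column names and Gradio datatypes from the metric's feature schema,
# roughly mirroring what launch_gradio_widget does internally.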
if isinstance(metric.features, list):
    (feature_names, feature_types) = zip(*metric.features[0].items())
else:
    (feature_names, feature_types) = zip(*metric.features.items())
gradio_input_types = infer_gradio_input_types(feature_types)
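# Directory containing this script; used below to render the Space's README.md as the article.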
local_path = Path(sys.path[0])
# configure these randomly using randint generator and feature names?
test_case_1 = [ {"predictions":[1,2,3,4,5], "references":[1,2,5,4,3]} ]
test_case_2 = [ {"predictions":[9,8,7,6,5], "references":[7,8,9,6,5]} ]
# configure this based on the input type, etc. for launch_gradio_widget
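# Read the two dataframe columns as predictions and references, score them with FixedF1
# using the selected averaging method, and report the result as text.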
def compute(input_df: pd.DataFrame, method: str):
    metric = FixedF1(average=method if method != "None" else None)
    cols = list(input_df.columns)
    predicted = [int(num) for num in input_df[cols[0]].to_list()]
    references = [int(num) for num in input_df[cols[1]].to_list()]
    metric.add_batch(predictions=predicted, references=references)
    outputs = metric.compute()
    return f"The F1 score for these predictions is: \n {outputs}"
space = gr.Interface(
    fn=compute,
    inputs=[
        gr.Dataframe(
            headers=feature_names,
            col_count=len(feature_names),
            row_count=5,
            datatype=json_to_string_type(gradio_input_types),
        ),
        gr.Radio(
            ["weighted", "micro", "macro", "None", "binary"],
            label="Averaging Method",
            info="Method for averaging the F1 score across labels. \n `binary` only works if you are evaluating a binary classification model."
        )
    ],
    outputs=gr.Textbox(label=metric.name),
    description=metric.info.description + added_description,
    title="FixedF1 Metric",  # think about how to generalize this with the launch_gradio_widget - it seems fine as is really
    article=parse_readme(local_path / "README.md"),
    examples=[
        [
            parse_test_cases(test_case_1, feature_names, gradio_input_types)[0],  # notice how we unpack this for when we fix launch_gradio_widget
            "weighted"
        ],
        [
            parse_test_cases(test_case_2, feature_names, gradio_input_types)[0],
            "micro"
        ],
    ],
    cache_examples=False
)
space.launch()