import sys
import gradio as gr
import pandas as pd
import evaluate
from evaluate.utils import infer_gradio_input_types, json_to_string_type, parse_readme, parse_test_cases
# from evaluate.utils import launch_gradio_widget  # using this directly is erroneous - let's fix this
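# Instead, the widget is rebuilt by hand below with the same evaluate.utils helpers that
# launch_gradio_widget relies on (infer_gradio_input_types, json_to_string_type,
# parse_test_cases), so we can wire in FixedF1 plus a selector for its `average` argument.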
from fixed_f1 import FixedF1
from pathlib import Path

added_description = """
See the 🤗 Space showing off how to combine various metrics here:
    [MarioBarbeque/CombinedEvaluationMetrics](https://huggingface.co/spaces/MarioBarbeque/CombinedEvaluationMetrics)

In the specific use case of the `FixedF1` metric, one writes the following:\n

```python
f1 = FixedF1(average=...)

f1.add_batch(predictions=..., references=...)
f1.compute()
```\n

where the `average` parameter configures how F1 scores are averaged across labels. Acceptable values are `[None, 'micro', 'macro', 'weighted']`
(or `'binary'` if there are only two labels).\n
"""

metric = FixedF1()

if isinstance(metric.features, list):
    (feature_names, feature_types) = zip(*metric.features[0].items())
else:
    (feature_names, feature_types) = zip(*metric.features.items())
gradio_input_types = infer_gradio_input_types(feature_types)
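# The block above mirrors what launch_gradio_widget does internally: read the metric's
# feature schema (names and dtypes of the predictions/references columns) and map each
# dtype to the Gradio input type used to build the Dataframe component below.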

local_path = Path(sys.path[0])
# TODO: generate these randomly using a randint generator and the feature names?
test_case_1 = [{"predictions": [1, 2, 3, 4, 5], "references": [1, 2, 5, 4, 3]}]
test_case_2 = [{"predictions": [9, 8, 7, 6, 5], "references": [7, 8, 9, 6, 5]}]
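# Each test case is a list wrapping a single dict keyed by the metric's feature names,
# which appears to be the shape parse_test_cases expects when building the example rows below.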

# TODO: configure this based on the inferred input types, etc., for a fixed launch_gradio_widget
def compute(input_df: pd.DataFrame, method: str):
    """Compute the F1 score for the dataframe's prediction/reference columns using the chosen averaging method."""
    metric = FixedF1(average=method if method != "None" else None)

    cols = list(input_df.columns)
    predicted = [int(num) for num in input_df[cols[0]].to_list()]
    references = [int(num) for num in input_df[cols[1]].to_list()]

    metric.add_batch(predictions=predicted, references=references)
    outputs = metric.compute()

    return f"The F1 score for these predictions is: \n {outputs}"
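# Hypothetical quick check (not executed by the Space itself):
#   df = pd.DataFrame({"predictions": [0, 1, 1, 0], "references": [0, 1, 0, 0]})
#   compute(df, "macro")  # -> something like "The F1 score for these predictions is: \n {'f1': ...}"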

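# Assemble the Space: a Dataframe for predictions/references, a Radio for the averaging
# method, and a Textbox that reports the computed score.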
space = gr.Interface(
    fn=compute,
    inputs=[
        gr.Dataframe(
            headers=feature_names,
            col_count=len(feature_names),
            row_count=5,
            datatype=json_to_string_type(gradio_input_types),
        ),
        gr.Radio(
            ["weighted", "micro", "macro", "None", "binary"], 
            label="Averaging Method", 
            info="Method for averaging the F1 score across labels. \n `binary` only works if you are evaluating a binary classification model."
        )
    ],
    outputs=gr.Textbox(label=metric.name),
    description=metric.info.description + added_description,
    title=f"Metric: {metric.name}",
    article=parse_readme(local_path / "README.md"),
    examples=[
        [
            parse_test_cases(test_case_1, feature_names, gradio_input_types)[0],  # note the [0] unpacking - relevant for when we fix launch_gradio_widget
            "weighted"
        ],
        [
            parse_test_cases(test_case_2, feature_names, gradio_input_types)[0],
            "micro"
        ],
    ],
    cache_examples=False,
)

space.launch()