import sys
import gradio as gr
import pandas as pd
import evaluate
from evaluate.utils import infer_gradio_input_types, json_to_string_type, parse_readme, parse_test_cases
# from evaluate.utils import launch_gradio_widget  # using this directly is erroneous - let's fix this
from fixed_f1 import FixedF1
from pathlib import Path
added_description = """
See the 🤗 Space showing off how to combine various metrics:
[MarioBarbeque/CombinedEvaluationMetrics🪲](https://huggingface.co/spaces/MarioBarbeque/CombinedEvaluationMetrics). This fix thereby circumvents the
original, longstanding issue found [here](https://github.com/huggingface/evaluate/issues/234). We look forward to fixing this in a PR soon.
In the specific use case of the `FixedF1` metric, one writes the following:\n
```python
f1 = FixedF1(average=...)
f1.add_batch(predictions=..., references=...)
f1.compute()
```\n
where the `average` parameter configures how F1 scores are averaged across labels. Acceptable values include `[None, 'micro', 'macro', 'weighted']`
(or `binary` when there are only two labels). \n
Play around with the interface below to see how the F1 score changes based on predictions, references, and method of averaging!
"""
metric = FixedF1()
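# Derive the input column names and Gradio datatypes from the metric's feature schema,
# roughly mirroring what launch_gradio_widget does internally.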
if isinstance(metric.features, list):
    (feature_names, feature_types) = zip(*metric.features[0].items())
else:
    (feature_names, feature_types) = zip(*metric.features.items())
gradio_input_types = infer_gradio_input_types(feature_types)
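# Directory containing this script; used below to render the Space's README.md as the article.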
local_path = Path(sys.path[0])
# configure these randomly using randint generator and feature names?
test_case_1 = [ {"predictions":[1,2,3,4,5], "references":[1,2,5,4,3]} ]
test_case_2 = [ {"predictions":[9,8,7,6,5], "references":[7,8,9,6,5]} ]
# configure this based on the input type, etc. for launch_gradio_widget
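# Read the two dataframe columns as predictions and references, score them with FixedF1
# using the selected averaging method, and report the result as text.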
def compute(input_df: pd.DataFrame, method: str):
    metric = FixedF1(average=method if method != "None" else None)
    cols = list(input_df.columns)
    predicted = [int(num) for num in input_df[cols[0]].to_list()]
    references = [int(num) for num in input_df[cols[1]].to_list()]
    metric.add_batch(predictions=predicted, references=references)
    outputs = metric.compute()
    return f"The F1 score for these predictions is: \n {outputs}"
space = gr.Interface(
    fn=compute,
    inputs=[
        gr.Dataframe(
            headers=feature_names,
            col_count=len(feature_names),
            row_count=5,
            datatype=json_to_string_type(gradio_input_types),
        ),
        gr.Radio(
            ["weighted", "micro", "macro", "None", "binary"],
            label="Averaging Method",
            info="Method for averaging the F1 score across labels. \n `binary` only works if you are evaluating a binary classification model."
        )
    ],
    outputs=gr.Textbox(label=metric.name),
    description=metric.info.description + added_description,
    title="FixedF1 Metric",  # think about how to generalize this with the launch_gradio_widget - it seems fine as is really
    article=parse_readme(local_path / "README.md"),
    examples=[
        [
            parse_test_cases(test_case_1, feature_names, gradio_input_types)[0],  # notice how we unpack this for when we fix launch_gradio_widget
            "weighted"
        ],
        [
            parse_test_cases(test_case_2, feature_names, gradio_input_types)[0],
            "micro"
        ],
    ],
    cache_examples=False
)
space.launch()