import logging
import sys
from pathlib import Path

import evaluate
from evaluate.utils import infer_gradio_input_types, json_to_string_type, parse_readme, parse_gradio_data, parse_test_cases

# Module-level logger; used to report a missing optional `gradio` dependency.
logger = logging.getLogger(__name__)

# Introductory text shown above the widget. Newlines are flattened to spaces
# before the text is handed to Gradio as the interface description.
_HEADER_HTML = '''

About SEScore

SEScore is a reference-based text-generation evaluation metric that requires no pre-human-annotated error data, described in our paper "Not All Errors are Equal: Learning Text Generation Metrics using Stratified Error Synthesis" from EMNLP 2022.

Its effectiveness over prior methods like BLEU and COMET has been demonstrated on a diverse set of language generation tasks, including translation, captioning, and web text generation. Readers have even described SEScore as "one unsupervised evaluation to rule them all" and we are very excited to share it with you!

Try it yourself!

Provide sample (gold) reference text and (model output) predicted text below and see how SEScore rates them! It is most performant in a relative ranking setting, so in general it will rank better predictions higher than worse ones. Providing useful absolute numbers based on SEScore is an ongoing direction of investigation.

'''.replace('\n',' ')


def launch_gradio_widget(metric):
    """Launches `metric` widget with Gradio.

    Builds a Gradio interface with a dataframe input (one column per metric
    feature) and a textbox output labelled with the metric's name, then
    launches it. The article shown below the widget is parsed from
    ``description.md`` located in ``sys.path[0]``.

    Args:
        metric: Loaded evaluation module. This function reads its
            ``features`` and ``name`` attributes and calls its ``compute``
            method.

    Raises:
        ImportError: If ``gradio`` is not installed.
    """
    try:
        import gradio as gr
    except ImportError:
        logger.error("To create a metric widget with Gradio make sure gradio is installed.")
        raise

    local_path = Path(sys.path[0])

    # if there are several input types, use first as default.
    if isinstance(metric.features, list):
        features = metric.features[0]
    else:
        features = metric.features
    (feature_names, feature_types) = zip(*features.items())
    gradio_input_types = infer_gradio_input_types(feature_types)

    def compute(data):
        """Convert the submitted dataframe into keyword arguments and score it."""
        return metric.compute(**parse_gradio_data(data, gradio_input_types))

    tail_markdown = parse_readme(local_path / "description.md")

    # NOTE(review): `gr.inputs` / `gr.outputs` are the legacy Gradio component
    # namespaces; confirm the pinned gradio version still provides them.
    iface = gr.Interface(
        fn=compute,
        inputs=gr.inputs.Dataframe(
            headers=feature_names,
            col_count=len(feature_names),
            row_count=2,
            datatype=json_to_string_type(gradio_input_types),
        ),
        outputs=gr.outputs.Textbox(label=metric.name),
        description=_HEADER_HTML,
        #title=f"SEScore Metric Usage Example",
        article=tail_markdown,
        # TODO: load test cases and use them to populate examples
        # examples=[parse_test_cases(test_cases, feature_names, gradio_input_types)]
    )

    iface.launch()


module = evaluate.load("xu1998hz/sescore")
launch_gradio_widget(module)