|
import evaluate
import gradio as gr
import pandas as pd

from fixed_f1 import FixedF1
from fixed_precision import FixedPrecision
from fixed_recall import FixedRecall
|
title = "'Combine' multiple metrics with this 🤗 Evaluate 🪲 Fix!"
|
description = """<p style='text-align: center'>
As I introduce myself to the entirety of the 🤗 ecosystem, I've put together this Space to show off a temporary fix for a current 🪲 in the 🤗 Evaluate library. \n

Check out the original, longstanding issue [here](https://github.com/huggingface/evaluate/issues/234). It details how it is currently impossible to \
`evaluate.combine()` multiple metrics related to multilabel text classification. In particular, one cannot `combine()` the f1, precision, and recall scores for \
evaluation. I encountered this issue while training [RoBERTa-base-DReiFT](https://huggingface.co/MarioBarbeque/RoBERTa-base-DReiFT) for multilabel \
text classification of 805 labeled medical conditions based on drug reviews. \n

This Space shows how one can instantiate these custom metrics, each with its own averaging method across labels, combine them into a single \
HF `evaluate.EvaluationModule` (or `Metric`), and compute them.</p>
"""
|
article = "<p style='text-align: center'>Check out the [original repo](https://github.com/johngrahamreynolds/FixedMetricsForHF) housing this code, and a quickly \
trained [multilabel text classification model](https://github.com/johngrahamreynolds/RoBERTa-base-DReiFT/tree/main) that makes use of it during evaluation.</p>"
|
|
def evaluation(predictions: pd.DataFrame, metrics: pd.DataFrame) -> str:
    # The second dataframe maps each metric name (e.g. "f1") to its averaging strategy
    averaging = {
        str(name).strip().lower(): avg
        for name, avg in zip(metrics["Metric"], metrics["Averaging Type"])
    }

    f1 = FixedF1(average=averaging.get("f1"))
    precision = FixedPrecision(average=averaging.get("precision"))
    recall = FixedRecall(average=averaging.get("recall"))
    combined = evaluate.combine([f1, recall, precision])

    # gr.Dataframe hands the component's value to the function as a pandas DataFrame
    predicted = predictions["Predicted Label"].astype(int).to_list()
    references = predictions["Actual Label"].astype(int).to_list()

    combined.add_batch(predictions=predicted, references=references)
    outputs = combined.compute()

    # compute() returns a dict of metric names to scores; format it for the textbox output
    return "Your metrics are as follows: \n" + "\n".join(f"{metric}: {score}" for metric, score in outputs.items())
|
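# Illustrative sketch of what a user might type into the two dataframes below (assumed values,
# not outputs from any model):
#   Predicted Label / Actual Label rows:  (1, 1), (0, 2), (2, 2), (1, 0), (0, 0)
#   Metric / Averaging Type rows:         ("f1", "weighted"), ("precision", "micro"), ("recall", "macro")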
space = gr.Interface(
    fn=evaluation,
    inputs=[
        gr.Dataframe(
            headers=["Predicted Label", "Actual Label"],
            datatype=["number", "number"],
            row_count=5,
            col_count=(2, "fixed"),
        ),
        gr.Dataframe(
            headers=["Metric", "Averaging Type"],
            datatype=["str", "str"],
            row_count=3,
            col_count=(2, "fixed"),
        ),
    ],
    outputs="textbox",
    title=title,
    description=description,
    article=article,
    cache_examples=False,
).launch()