# sdlm/metrics/metrics.py
"""Implements the metrics for evaluation of the diffusion models."""
import math
import operator
from collections import Counter
import numpy as np
import scipy
import sklearn
from mauve import compute_mauve
from nltk.util import ngrams
from rouge_score import rouge_scorer
from scipy import stats
MAX_TEXT_LENGTH = 256
default_rouge_scorer = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)
def mauve(
predictions, references, featurize_model_name="gpt2-large", length=MAX_TEXT_LENGTH
):
"""Computes MAUVE scores between two lists of generated text and reference text.
Args:
predictions (list of str) of predictions.
reference (list of str) of references.
"""
results = compute_mauve(
p_text=references, # human-text.
q_text=predictions, # machine-text.
max_text_length=length,
featurize_model_name=featurize_model_name,
verbose=False,
device_id=0,
# These are the tricks to make `mauve` run faster if #examples > 5K.
# See https://github.com/krishnap25/mauve#best-practices-for-mauve
# num_buckets=500 if len(predictions) > 5000 else "auto",
# kmeans_num_redo=1,
)
return {"mauve": results.mauve}
def distinct_n_grams(texts):
"""Computes the average distinct n-grams of the generated texts.
Args:
texts (list of str): representing the generated texts.
"""
    dist_1, dist_2, dist_3, dist_4 = [], [], [], []
    for text in texts:
        words = text.split()
        total_words = len(words)
        if total_words == 0:
            # Empty generations contribute zero diversity.
            for dist in (dist_1, dist_2, dist_3, dist_4):
                dist.append(0)
        else:
            dist_1.append(len(set(ngrams(words, 1))) / total_words)
            dist_2.append(len(set(ngrams(words, 2))) / total_words)
            dist_3.append(len(set(ngrams(words, 3))) / total_words)
            dist_4.append(len(set(ngrams(words, 4))) / total_words)
return {
"dist-1": np.nanmean(dist_1),
"dist-2": np.nanmean(dist_2),
"dist-3": np.nanmean(dist_3),
"dist-4": np.nanmean(dist_4),
}
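
# Worked example (hand-computed on a hypothetical input): "the cat sat" has
# 3 words with 3 unique unigrams, 2 bigrams, 1 trigram, and 0 fourgrams, so
# distinct_n_grams(["the cat sat"]) returns
# {"dist-1": 1.0, "dist-2": 2/3, "dist-3": 1/3, "dist-4": 0.0}.
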
def zipf(tokenized_texts, N=5000):
"""Computes the Zipf coefficient.
Args:
tokenized_texts (List[List[int]]) tokenized texts.
Adapted from https://github.com/ari-holtzman/degen/blob/master/metrics/zipf.py
"""
cnt = Counter()
for tokenized_text in tokenized_texts:
cnt.update(tokenized_text)
xs = np.arange(1, min(len(cnt), N) + 1)
ys = np.array(sorted(cnt.values(), key=operator.neg)[:N])
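    # Fit log(frequency) against log(rank). Zipf's law predicts
    # frequency ∝ rank^(-s) with s close to 1 for natural text, so the fitted
    # slope `a` is negative and -a is the estimated Zipf coefficient.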
a, b, r, p, std = stats.linregress(np.log(xs), np.log(ys))
# Note that zipf_minus_a is the reported number.
return {"zipf_minus_a": -a, "zipf_minus_r": -r, "zipf_p": p}
def accuracy(predictions, targets) -> dict:
"""Computes the average accuracy."""
return {"accuracy": 100 * ((np.array(predictions) == np.array(targets)).mean())}
def pearson_corrcoef(predictions, targets) -> dict:
"""Computes Pearson correlation coefficient."""
pearson_corrcoef = 100 * scipy.stats.pearsonr(targets, predictions)[0]
    # If all the predictions are identical, the Pearson correlation is nan.
    # To guard against this, we check the output and return 0 in that case.
if math.isnan(pearson_corrcoef):
pearson_corrcoef = 0
return {"pearson": pearson_corrcoef}
def spearman_corrcoef(predictions, targets) -> dict:
"""Computes Spearman correlation coefficient."""
spearman_corrcoef = 100 * scipy.stats.spearmanr(targets, predictions)[0]
    # If all the predictions are identical, the Spearman correlation is nan.
    # To guard against this, we check the output and return 0 in that case.
if math.isnan(spearman_corrcoef):
spearman_corrcoef = 0
return {"spearmanr": spearman_corrcoef}
def f1_score_with_invalid(predictions, targets) -> dict:
"""Computes F1 score, with any prediction != 0 or 1 is counted as incorrect.
Args:
targets: list of targets, either 0 or 1
predictions: list of predictions, any integer value
Returns:
F1 score, where any prediction != 0 or 1 is counted as wrong.
"""
targets, predictions = np.asarray(targets), np.asarray(predictions)
# Get indices of invalid predictions.
invalid_idx_mask = np.logical_and(predictions != 0, predictions != 1)
# For any prediction != 0 or 1, we set the prediction to the opposite of its corresponding target.
predictions[invalid_idx_mask] = 1 - targets[invalid_idx_mask]
return {"f1": 100 * sklearn.metrics.f1_score(targets, predictions)}
# TODO: maybe guard against invalid values https://stackoverflow.com/questions/56865344/how-do-i-calculate-the-matthews-correlation-coefficient-in-tensorflow
def matthews_corrcoef(predictions, targets) -> dict:
"""Computes the Matthews correlation coefficient."""
return {
"matthews_correlation": 100
* sklearn.metrics.matthews_corrcoef(targets, predictions)
}
def rouge(predictions, targets) -> dict:
"""Computes the ROUGE score."""
    scores = [
        default_rouge_scorer.score(prediction=p, target=t)
        for p, t in zip(predictions, targets)
    ]
    # Each entry is a `Score(precision, recall, fmeasure)` namedtuple; report
    # the average F-measure per ROUGE type rather than averaging all three
    # fields together.
    average_scores = {
        k: np.mean([score[k].fmeasure for score in scores]) for k in scores[0]
    }
    return average_scores
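
# Example (hypothetical strings): rouge(["the cat sat on the mat"],
# ["the cat sat on a mat"]) -> {"rougeL": ≈ 0.83}, the LCS-based F-measure
# (an LCS of 5 tokens over lengths of 6 on both sides).
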
def get_glue_metrics(task):
    """Returns the list of metric functions for the given task name."""
    GLUE_TASKS_TO_METRICS = {
"mrpc": [f1_score_with_invalid, accuracy],
"cola": [matthews_corrcoef],
"sst2": [accuracy],
"stsb": [pearson_corrcoef, spearman_corrcoef],
"qqp": [f1_score_with_invalid, accuracy],
"mnli": [accuracy],
"qnli": [accuracy],
"rte": [accuracy],
"wnli": [accuracy],
"sni": [rouge],
}
return GLUE_TASKS_TO_METRICS[task]
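
# Usage sketch (hypothetical `predictions`/`targets` lists):
#   results = {}
#   for metric_fn in get_glue_metrics("mrpc"):
#       results.update(metric_fn(predictions, targets))
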