# sdlm/metrics/metrics.py
"""Implements the metrics for evaluation of the diffusion models."""
import math
import operator
from collections import Counter
import numpy as np
import scipy
import sklearn
from mauve import compute_mauve
from nltk.util import ngrams
from rouge_score import rouge_scorer
from scipy import stats
MAX_TEXT_LENGTH = 256
default_rouge_scorer = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)
def mauve(
predictions, references, featurize_model_name="gpt2-large", length=MAX_TEXT_LENGTH
):
"""Computes MAUVE scores between two lists of generated text and reference text.
Args:
predictions (list of str) of predictions.
reference (list of str) of references.
"""
results = compute_mauve(
p_text=references, # human-text.
q_text=predictions, # machine-text.
max_text_length=length,
featurize_model_name=featurize_model_name,
verbose=False,
device_id=0,
# These are the tricks to make `mauve` run faster if #examples > 5K.
# See https://github.com/krishnap25/mauve#best-practices-for-mauve
# num_buckets=500 if len(predictions) > 5000 else "auto",
# kmeans_num_redo=1,
)
return {"mauve": results.mauve}
def distinct_n_grams(texts):
"""Computes the average distinct n-grams of the generated texts.
Args:
texts (list of str): representing the generated texts.
"""
    dist_1, dist_2, dist_3, dist_4 = [], [], [], []
    for text in texts:
        words = text.split()
        total_words = len(words)
        if total_words == 0:
            # Empty generations contribute zero diversity.
            for dist in (dist_1, dist_2, dist_3, dist_4):
                dist.append(0)
        else:
            dist_1.append(len(set(ngrams(words, 1))) / total_words)
            dist_2.append(len(set(ngrams(words, 2))) / total_words)
            dist_3.append(len(set(ngrams(words, 3))) / total_words)
            dist_4.append(len(set(ngrams(words, 4))) / total_words)
return {
"dist-1": np.nanmean(dist_1),
"dist-2": np.nanmean(dist_2),
"dist-3": np.nanmean(dist_3),
"dist-4": np.nanmean(dist_4),
}
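
# Worked example (hand-computed on a hypothetical input): "the cat sat" has
# 3 words with 3 unique unigrams, 2 bigrams, 1 trigram, and 0 fourgrams, so
# distinct_n_grams(["the cat sat"]) returns
# {"dist-1": 1.0, "dist-2": 2/3, "dist-3": 1/3, "dist-4": 0.0}.
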
def zipf(tokenized_texts, N=5000):
"""Computes the Zipf coefficient.
Args:
tokenized_texts (List[List[int]]) tokenized texts.
Adapted from https://github.com/ari-holtzman/degen/blob/master/metrics/zipf.py
"""
cnt = Counter()
for tokenized_text in tokenized_texts:
cnt.update(tokenized_text)
xs = np.arange(1, min(len(cnt), N) + 1)
ys = np.array(sorted(cnt.values(), key=operator.neg)[:N])
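    # Fit log(frequency) against log(rank). Zipf's law predicts
    # frequency ∝ rank^(-s) with s close to 1 for natural text, so the fitted
    # slope `a` is negative and -a is the estimated Zipf coefficient.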
a, b, r, p, std = stats.linregress(np.log(xs), np.log(ys))
# Note that zipf_minus_a is the reported number.
return {"zipf_minus_a": -a, "zipf_minus_r": -r, "zipf_p": p}
def accuracy(predictions, targets) -> dict:
"""Computes the average accuracy."""
return {"accuracy": 100 * ((np.array(predictions) == np.array(targets)).mean())}
def pearson_corrcoef(predictions, targets) -> dict:
"""Computes Pearson correlation coefficient."""
pearson_corrcoef = 100 * scipy.stats.pearsonr(targets, predictions)[0]
    # If all the predictions are identical, the Pearson correlation is nan.
    # To guard against this, we check the output and return 0 in that case.
if math.isnan(pearson_corrcoef):
pearson_corrcoef = 0
return {"pearson": pearson_corrcoef}
def spearman_corrcoef(predictions, targets) -> dict:
"""Computes Spearman correlation coefficient."""
spearman_corrcoef = 100 * scipy.stats.spearmanr(targets, predictions)[0]
    # If all the predictions are identical, the Spearman correlation is nan.
    # To guard against this, we check the output and return 0 in that case.
if math.isnan(spearman_corrcoef):
spearman_corrcoef = 0
return {"spearmanr": spearman_corrcoef}
def f1_score_with_invalid(predictions, targets) -> dict:
"""Computes F1 score, with any prediction != 0 or 1 is counted as incorrect.
Args:
targets: list of targets, either 0 or 1
predictions: list of predictions, any integer value
Returns:
F1 score, where any prediction != 0 or 1 is counted as wrong.
"""
targets, predictions = np.asarray(targets), np.asarray(predictions)
# Get indices of invalid predictions.
invalid_idx_mask = np.logical_and(predictions != 0, predictions != 1)
# For any prediction != 0 or 1, we set the prediction to the opposite of its corresponding target.
predictions[invalid_idx_mask] = 1 - targets[invalid_idx_mask]
return {"f1": 100 * sklearn.metrics.f1_score(targets, predictions)}
# TODO: maybe guard against invalid values https://stackoverflow.com/questions/56865344/how-do-i-calculate-the-matthews-correlation-coefficient-in-tensorflow
def matthews_corrcoef(predictions, targets) -> dict:
"""Computes the Matthews correlation coefficient."""
return {
"matthews_correlation": 100
* sklearn.metrics.matthews_corrcoef(targets, predictions)
}
def rouge(predictions, targets) -> dict:
"""Computes the ROUGE score."""
    scores = [
        default_rouge_scorer.score(prediction=p, target=t)
        for p, t in zip(predictions, targets)
    ]
    # Each entry is a `Score(precision, recall, fmeasure)` namedtuple; report
    # the average F-measure per ROUGE type rather than averaging all three
    # fields together.
    average_scores = {
        k: np.mean([score[k].fmeasure for score in scores]) for k in scores[0]
    }
    return average_scores
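
# Example (hypothetical strings): rouge(["the cat sat on the mat"],
# ["the cat sat on a mat"]) -> {"rougeL": ≈ 0.83}, the LCS-based F-measure
# (an LCS of 5 tokens over lengths of 6 on both sides).
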
def get_glue_metrics(task):
    """Returns the list of metric functions for the given task name."""
    GLUE_TASKS_TO_METRICS = {
"mrpc": [f1_score_with_invalid, accuracy],
"cola": [matthews_corrcoef],
"sst2": [accuracy],
"stsb": [pearson_corrcoef, spearman_corrcoef],
"qqp": [f1_score_with_invalid, accuracy],
"mnli": [accuracy],
"qnli": [accuracy],
"rte": [accuracy],
"wnli": [accuracy],
"sni": [rouge],
}
return GLUE_TASKS_TO_METRICS[task]
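
# Usage sketch (hypothetical `predictions`/`targets` lists):
#   results = {}
#   for metric_fn in get_glue_metrics("mrpc"):
#       results.update(metric_fn(predictions, targets))
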