"""Implements the metrics for evaluation of the diffusion models.""" | |
import math | |
import operator | |
from collections import Counter | |
import numpy as np | |
import scipy | |
import sklearn | |
from mauve import compute_mauve | |
from nltk.util import ngrams | |
from rouge_score import rouge_scorer | |
from scipy import stats | |
MAX_TEXT_LENGTH = 256 | |
default_rouge_scorer = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True) | |


def mauve(
    predictions, references, featurize_model_name="gpt2-large", length=MAX_TEXT_LENGTH
):
    """Computes MAUVE scores between a list of generated texts and a list of references.

    Args:
        predictions (list of str): the machine-generated texts.
        references (list of str): the human-written reference texts.
    """
    results = compute_mauve(
        p_text=references,  # human text.
        q_text=predictions,  # machine text.
        max_text_length=length,
        featurize_model_name=featurize_model_name,
        verbose=False,
        device_id=0,
        # These tricks make `mauve` run faster when #examples > 5K.
        # See https://github.com/krishnap25/mauve#best-practices-for-mauve
        # num_buckets=500 if len(predictions) > 5000 else "auto",
        # kmeans_num_redo=1,
    )
    return {"mauve": results.mauve}


def distinct_n_grams(texts):
    """Computes the average distinct n-grams of the generated texts.

    Args:
        texts (list of str): the generated texts.
    """
    dist_1, dist_2, dist_3, dist_4 = [], [], [], []
    for text in texts:
        words = text.split()
        total_words = len(words)
        unigrams = set(ngrams(words, 1))
        bigrams = set(ngrams(words, 2))
        trigrams = set(ngrams(words, 3))
        fourgrams = set(ngrams(words, 4))
        if total_words == 0:
            dist_1.append(0)
            dist_2.append(0)
            dist_3.append(0)
            dist_4.append(0)
        else:
            dist_1.append(len(unigrams) / total_words)
            dist_2.append(len(bigrams) / total_words)
            dist_3.append(len(trigrams) / total_words)
            dist_4.append(len(fourgrams) / total_words)
    return {
        "dist-1": np.nanmean(dist_1),
        "dist-2": np.nanmean(dist_2),
        "dist-3": np.nanmean(dist_3),
        "dist-4": np.nanmean(dist_4),
    }
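
# Worked example: "the cat sat on the mat" has 6 words, 5 distinct unigrams,
# 5 distinct bigrams, 4 distinct trigrams, and 3 distinct 4-grams, each divided
# by the word count:
#
#   >>> distinct_n_grams(["the cat sat on the mat"])
#   {'dist-1': 0.833..., 'dist-2': 0.833..., 'dist-3': 0.666..., 'dist-4': 0.5}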


def zipf(tokenized_texts, N=5000):
    """Computes the Zipf coefficient.

    Args:
        tokenized_texts (List[List[int]]): the tokenized texts.

    Adapted from https://github.com/ari-holtzman/degen/blob/master/metrics/zipf.py
    """
    cnt = Counter()
    for tokenized_text in tokenized_texts:
        cnt.update(tokenized_text)
    # Fit a line to log-rank vs. log-frequency over the N most frequent tokens.
    xs = np.arange(1, min(len(cnt), N) + 1)
    ys = np.array(sorted(cnt.values(), key=operator.neg)[:N])
    a, b, r, p, std = stats.linregress(np.log(xs), np.log(ys))
    # Note that `zipf_minus_a` is the reported number.
    return {"zipf_minus_a": -a, "zipf_minus_r": -r, "zipf_p": p}


def accuracy(predictions, targets) -> dict:
    """Computes the average accuracy."""
    return {"accuracy": 100 * ((np.array(predictions) == np.array(targets)).mean())}


def pearson_corrcoef(predictions, targets) -> dict:
    """Computes the Pearson correlation coefficient."""
    pearson_corrcoef = 100 * scipy.stats.pearsonr(targets, predictions)[0]
    # Note that if all the predictions are the same, the Pearson correlation
    # is NaN; to guard against this, we check the output and return 0 in
    # this case.
    if math.isnan(pearson_corrcoef):
        pearson_corrcoef = 0
    return {"pearson": pearson_corrcoef}


def spearman_corrcoef(predictions, targets) -> dict:
    """Computes the Spearman correlation coefficient."""
    spearman_corrcoef = 100 * scipy.stats.spearmanr(targets, predictions)[0]
    # Note that if all the predictions are the same, the Spearman correlation
    # is NaN; to guard against this, we check the output and return 0 in
    # this case.
    if math.isnan(spearman_corrcoef):
        spearman_corrcoef = 0
    return {"spearmanr": spearman_corrcoef}


def f1_score_with_invalid(predictions, targets) -> dict:
    """Computes the F1 score; any prediction != 0 or 1 is counted as incorrect.

    Args:
        targets: list of targets, either 0 or 1.
        predictions: list of predictions, any integer value.

    Returns:
        F1 score, where any prediction != 0 or 1 is counted as wrong.
    """
    targets, predictions = np.asarray(targets), np.asarray(predictions)
    # Get indices of invalid predictions.
    invalid_idx_mask = np.logical_and(predictions != 0, predictions != 1)
    # For any prediction != 0 or 1, set the prediction to the opposite of its
    # corresponding target, so it is guaranteed to count as incorrect.
    predictions[invalid_idx_mask] = 1 - targets[invalid_idx_mask]
    return {"f1": 100 * sklearn.metrics.f1_score(targets, predictions)}


# TODO: maybe guard against invalid values, see
# https://stackoverflow.com/questions/56865344/how-do-i-calculate-the-matthews-correlation-coefficient-in-tensorflow
def matthews_corrcoef(predictions, targets) -> dict:
    """Computes the Matthews correlation coefficient."""
    return {
        "matthews_correlation": 100
        * sklearn.metrics.matthews_corrcoef(targets, predictions)
    }


def rouge(predictions, targets) -> dict:
    """Computes the average ROUGE score (F-measure)."""
    scores = [
        default_rouge_scorer.score(prediction=p, target=t)
        for p, t in zip(predictions, targets)
    ]
    # Each entry is a `Score(precision, recall, fmeasure)` tuple; average the
    # F-measure per ROUGE variant rather than mixing all three fields together.
    average_scores = {
        k: np.mean([score[k].fmeasure for score in scores]) for k in scores[0]
    }
    return average_scores
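
# Example: with the default rougeL scorer, "the cat sat" vs. "the cat sat on
# the mat" has a longest common subsequence of 3 tokens (precision 1.0,
# recall 0.5):
#
#   >>> rouge(predictions=["the cat sat"], targets=["the cat sat on the mat"])
#   {'rougeL': 0.666...}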


def get_glue_metrics(task):
    """Returns the list of metric functions for the given task."""
    GLUE_TASKS_TO_METRICS = {
        "mrpc": [f1_score_with_invalid, accuracy],
        "cola": [matthews_corrcoef],
        "sst2": [accuracy],
        "stsb": [pearson_corrcoef, spearman_corrcoef],
        "qqp": [f1_score_with_invalid, accuracy],
        "mnli": [accuracy],
        "qnli": [accuracy],
        "rte": [accuracy],
        "wnli": [accuracy],
        "sni": [rouge],
    }
    return GLUE_TASKS_TO_METRICS[task]
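
# Usage sketch; the label lists below are hypothetical, already mapped to {0, 1}:
#
#   >>> results = {}
#   >>> for metric in get_glue_metrics("mrpc"):
#   ...     results.update(metric(predictions=[1, 0, 1], targets=[1, 0, 0]))
#   >>> results
#   {'f1': 66.66..., 'accuracy': 66.66...}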