"""Implements the metrics for evaluation of the diffusion models."""
import math
import operator
from collections import Counter
import numpy as np
import scipy
import sklearn
from mauve import compute_mauve
from nltk.util import ngrams
from rouge_score import rouge_scorer
from scipy import stats
MAX_TEXT_LENGTH = 256
default_rouge_scorer = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)
def mauve(
predictions, references, featurize_model_name="gpt2-large", length=MAX_TEXT_LENGTH
):
"""Computes MAUVE scores between two lists of generated text and reference text.
Args:
        predictions (list of str): the generated (machine) texts.
        references (list of str): the reference (human) texts.
"""
results = compute_mauve(
p_text=references, # human-text.
q_text=predictions, # machine-text.
max_text_length=length,
featurize_model_name=featurize_model_name,
verbose=False,
device_id=0,
# These are the tricks to make `mauve` run faster if #examples > 5K.
# See https://github.com/krishnap25/mauve#best-practices-for-mauve
# num_buckets=500 if len(predictions) > 5000 else "auto",
# kmeans_num_redo=1,
)
return {"mauve": results.mauve}
def distinct_n_grams(texts):
"""Computes the average distinct n-grams of the generated texts.
Args:
        texts (list of str): the generated texts.
"""
dist_1, dist_2, dist_3, dist_4 = [], [], [], []
    for text in texts:
        words = text.split()
        total_words = len(words)
        unigrams = set(ngrams(words, 1))
        bigrams = set(ngrams(words, 2))
        trigrams = set(ngrams(words, 3))
        fourgrams = set(ngrams(words, 4))
if total_words == 0:
dist_1.append(0)
dist_2.append(0)
dist_3.append(0)
dist_4.append(0)
else:
dist_1.append(len(unigrams) / total_words)
dist_2.append(len(bigrams) / total_words)
dist_3.append(len(trigrams) / total_words)
dist_4.append(len(fourgrams) / total_words)
return {
"dist-1": np.nanmean(dist_1),
"dist-2": np.nanmean(dist_2),
"dist-3": np.nanmean(dist_3),
"dist-4": np.nanmean(dist_4),
}
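
# Worked example (comment only, values follow from the definition above): for the
# single text "the cat sat on the mat" there are 6 words and 5 distinct unigrams
# ({"the", "cat", "sat", "on", "mat"}), and all higher-order n-grams are distinct,
# so dist-1 = 5/6, dist-2 = 5/6, dist-3 = 4/6 and dist-4 = 3/6 (each divided by
# the total word count, as in the loop above).
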
def zipf(tokenized_texts, N=5000):
"""Computes the Zipf coefficient.
Args:
        tokenized_texts (List[List[int]]): the tokenized texts.
Adapted from https://github.com/ari-holtzman/degen/blob/master/metrics/zipf.py
"""
cnt = Counter()
for tokenized_text in tokenized_texts:
cnt.update(tokenized_text)
xs = np.arange(1, min(len(cnt), N) + 1)
ys = np.array(sorted(cnt.values(), key=operator.neg)[:N])
a, b, r, p, std = stats.linregress(np.log(xs), np.log(ys))
# Note that zipf_minus_a is the reported number.
return {"zipf_minus_a": -a, "zipf_minus_r": -r, "zipf_p": p}
def accuracy(predictions, targets) -> dict:
"""Computes the average accuracy."""
return {"accuracy": 100 * ((np.array(predictions) == np.array(targets)).mean())}
def pearson_corrcoef(predictions, targets) -> dict:
"""Computes Pearson correlation coefficient."""
pearson_corrcoef = 100 * scipy.stats.pearsonr(targets, predictions)[0]
    # If all the predictions are the same, the Pearson correlation is NaN;
    # to guard against this, we check the output and return 0 in that case.
if math.isnan(pearson_corrcoef):
pearson_corrcoef = 0
return {"pearson": pearson_corrcoef}
def spearman_corrcoef(predictions, targets) -> dict:
"""Computes Spearman correlation coefficient."""
spearman_corrcoef = 100 * scipy.stats.spearmanr(targets, predictions)[0]
    # If all the predictions are the same, the Spearman correlation is NaN;
    # to guard against this, we check the output and return 0 in that case.
if math.isnan(spearman_corrcoef):
spearman_corrcoef = 0
return {"spearmanr": spearman_corrcoef}
def f1_score_with_invalid(predictions, targets) -> dict:
"""Computes F1 score, with any prediction != 0 or 1 is counted as incorrect.
Args:
targets: list of targets, either 0 or 1
predictions: list of predictions, any integer value
Returns:
F1 score, where any prediction != 0 or 1 is counted as wrong.
"""
targets, predictions = np.asarray(targets), np.asarray(predictions)
# Get indices of invalid predictions.
invalid_idx_mask = np.logical_and(predictions != 0, predictions != 1)
# For any prediction != 0 or 1, we set the prediction to the opposite of its corresponding target.
predictions[invalid_idx_mask] = 1 - targets[invalid_idx_mask]
return {"f1": 100 * sklearn.metrics.f1_score(targets, predictions)}
# TODO: maybe guard against invalid values, see
# https://stackoverflow.com/questions/56865344/how-do-i-calculate-the-matthews-correlation-coefficient-in-tensorflow
def matthews_corrcoef(predictions, targets) -> dict:
"""Computes the Matthews correlation coefficient."""
return {
"matthews_correlation": 100
* sklearn.metrics.matthews_corrcoef(targets, predictions)
}
def rouge(predictions, targets) -> dict:
"""Computes the ROUGE score."""
scores = [
default_rouge_scorer.score(prediction=p, target=t)
for p, t in zip(predictions, targets)
]
    # Report the F-measure of each ROUGE type, averaged over all examples.
    average_scores = {
        k: np.mean([score[k].fmeasure for score in scores]) for k in scores[0]
    }
return average_scores
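
# Illustrative usage (comment only; the value is approximate and assumes the
# F-measure aggregation above):
#   rouge(["the cat sat"], ["the cat sat down"]) -> {"rougeL": 0.857}
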
def get_glue_metrics(task):
GLUE_TASKS_TO_METRICS = {
"mrpc": [f1_score_with_invalid, accuracy],
"cola": [matthews_corrcoef],
"sst2": [accuracy],
"stsb": [pearson_corrcoef, spearman_corrcoef],
"qqp": [f1_score_with_invalid, accuracy],
"mnli": [accuracy],
"qnli": [accuracy],
"rte": [accuracy],
"wnli": [accuracy],
"sni": [rouge],
}
return GLUE_TASKS_TO_METRICS[task]
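

if __name__ == "__main__":
    # Minimal smoke test (illustrative, not part of the original module). The
    # MAUVE metric is skipped because it downloads and runs a large feature
    # model; the calls below only demonstrate the output format of each metric.
    print(accuracy([0, 1, 1], [0, 1, 0]))
    print(distinct_n_grams(["the cat sat on the mat", "a dog barked"]))
    print(zipf([[5, 5, 5, 7, 7, 11], [5, 7, 11, 13]]))
    print(f1_score_with_invalid([0, 1, 2, 0], [0, 1, 1, 0]))
    print(rouge(["the cat sat"], ["the cat sat down"]))
    print([metric.__name__ for metric in get_glue_metrics("mrpc")])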