"""Implements the metrics for evaluation of the diffusion models.""" import math import operator from collections import Counter import numpy as np import scipy import sklearn from mauve import compute_mauve from nltk.util import ngrams from rouge_score import rouge_scorer from scipy import stats MAX_TEXT_LENGTH = 256 default_rouge_scorer = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True) def mauve( predictions, references, featurize_model_name="gpt2-large", length=MAX_TEXT_LENGTH ): """Computes MAUVE scores between two lists of generated text and reference text. Args: predictions (list of str) of predictions. reference (list of str) of references. """ results = compute_mauve( p_text=references, # human-text. q_text=predictions, # machine-text. max_text_length=length, featurize_model_name=featurize_model_name, verbose=False, device_id=0, # These are the tricks to make `mauve` run faster if #examples > 5K. # See https://github.com/krishnap25/mauve#best-practices-for-mauve # num_buckets=500 if len(predictions) > 5000 else "auto", # kmeans_num_redo=1, ) return {"mauve": results.mauve} def distinct_n_grams(texts): """Computes the average distinct n-grams of the generated texts. Args: texts (list of str): representing the generated texts. """ dist_1, dist_2, dist_3, dist_4 = [], [], [], [] for text in texts: total_words = len(text.split()) unigrams = set(ngrams(text.split(), 1)) bigrams = set(ngrams(text.split(), 2)) trigrams = set(ngrams(text.split(), 3)) fourgrams = set(ngrams(text.split(), 4)) if total_words == 0: dist_1.append(0) dist_2.append(0) dist_3.append(0) dist_4.append(0) else: dist_1.append(len(unigrams) / total_words) dist_2.append(len(bigrams) / total_words) dist_3.append(len(trigrams) / total_words) dist_4.append(len(fourgrams) / total_words) return { "dist-1": np.nanmean(dist_1), "dist-2": np.nanmean(dist_2), "dist-3": np.nanmean(dist_3), "dist-4": np.nanmean(dist_4), } def zipf(tokenized_texts, N=5000): """Computes the Zipf coefficient. Args: tokenized_texts (List[List[int]]) tokenized texts. Adapted from https://github.com/ari-holtzman/degen/blob/master/metrics/zipf.py """ cnt = Counter() for tokenized_text in tokenized_texts: cnt.update(tokenized_text) xs = np.arange(1, min(len(cnt), N) + 1) ys = np.array(sorted(cnt.values(), key=operator.neg)[:N]) a, b, r, p, std = stats.linregress(np.log(xs), np.log(ys)) # Note that zipf_minus_a is the reported number. return {"zipf_minus_a": -a, "zipf_minus_r": -r, "zipf_p": p} def accuracy(predictions, targets) -> dict: """Computes the average accuracy.""" return {"accuracy": 100 * ((np.array(predictions) == np.array(targets)).mean())} def pearson_corrcoef(predictions, targets) -> dict: """Computes Pearson correlation coefficient.""" pearson_corrcoef = 100 * scipy.stats.pearsonr(targets, predictions)[0] # Note that if all the predictions will be the same, spearman # correlation is nan, to gaurad against this, we check the output # and return 0 in this case. if math.isnan(pearson_corrcoef): pearson_corrcoef = 0 return {"pearson": pearson_corrcoef} def spearman_corrcoef(predictions, targets) -> dict: """Computes Spearman correlation coefficient.""" spearman_corrcoef = 100 * scipy.stats.spearmanr(targets, predictions)[0] # Note that if all the predictions will be the same, spearman # correlation is nan, to gaurad against this, we check the output # and return 0 in this case. 
    if math.isnan(spearman_corrcoef):
        spearman_corrcoef = 0
    return {"spearmanr": spearman_corrcoef}


def f1_score_with_invalid(predictions, targets) -> dict:
    """Computes the F1 score, where any prediction != 0 or 1 is counted as incorrect.

    Args:
        targets: list of targets, either 0 or 1.
        predictions: list of predictions, any integer value.

    Returns:
        F1 score, where any prediction != 0 or 1 is counted as wrong.
    """
    # Copy the inputs so that invalid predictions can be overwritten in place.
    targets, predictions = np.array(targets), np.array(predictions)
    # Get indices of invalid predictions.
    invalid_idx_mask = np.logical_and(predictions != 0, predictions != 1)
    # For any prediction != 0 or 1, set the prediction to the opposite of its
    # corresponding target, so it is guaranteed to be counted as wrong.
    predictions[invalid_idx_mask] = 1 - targets[invalid_idx_mask]
    return {"f1": 100 * sklearn.metrics.f1_score(targets, predictions)}


# TODO: maybe guard against invalid values, see
# https://stackoverflow.com/questions/56865344/how-do-i-calculate-the-matthews-correlation-coefficient-in-tensorflow
def matthews_corrcoef(predictions, targets) -> dict:
    """Computes the Matthews correlation coefficient."""
    return {
        "matthews_correlation": 100
        * sklearn.metrics.matthews_corrcoef(targets, predictions)
    }


def rouge(predictions, targets) -> dict:
    """Computes the ROUGE scores, reporting the mean F-measure per variant."""
    scores = [
        default_rouge_scorer.score(prediction=p, target=t)
        for p, t in zip(predictions, targets)
    ]
    # Each score maps a ROUGE variant (e.g. "rougeL") to a
    # (precision, recall, fmeasure) tuple; report the mean F-measure.
    average_scores = {
        k: np.mean([score[k].fmeasure for score in scores]) for k in scores[0]
    }
    return average_scores


def get_glue_metrics(task):
    """Returns the list of metric functions used for the given task."""
    GLUE_TASKS_TO_METRICS = {
        "mrpc": [f1_score_with_invalid, accuracy],
        "cola": [matthews_corrcoef],
        "sst2": [accuracy],
        "stsb": [pearson_corrcoef, spearman_corrcoef],
        "qqp": [f1_score_with_invalid, accuracy],
        "mnli": [accuracy],
        "qnli": [accuracy],
        "rte": [accuracy],
        "wnli": [accuracy],
        "sni": [rouge],
    }
    return GLUE_TASKS_TO_METRICS[task]
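

# ---------------------------------------------------------------------------
# Minimal usage sketch. The sample strings, token ids, and labels below are
# made up for illustration only; the MAUVE metric is not exercised here
# because it downloads a GPT-2 featurizer and runs a featurization pass.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    sample_predictions = ["the cat sat on the mat", "a quick brown fox"]
    sample_references = ["the cat lay on the rug", "a fast brown fox"]

    # Lexical diversity of the generated texts.
    print(distinct_n_grams(sample_predictions))

    # ROUGE-L (mean F-measure) between predictions and references.
    print(rouge(sample_predictions, sample_references))

    # Zipf coefficient over (hypothetical) token-id sequences.
    print(zipf([[1, 2, 3, 2, 1], [4, 5, 1, 1, 2]]))

    # GLUE-style classification metrics, e.g. for SST-2 (accuracy only).
    for metric in get_glue_metrics("sst2"):
        print(metric(predictions=[1, 0, 1], targets=[1, 1, 1]))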