"""Implements the metrics for evaluation of the diffusion models."""
import math
import operator
from collections import Counter

import numpy as np
import scipy
import sklearn
from mauve import compute_mauve
from nltk.util import ngrams
from rouge_score import rouge_scorer
from scipy import stats

MAX_TEXT_LENGTH = 256

default_rouge_scorer = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)


def mauve(
    predictions, references, featurize_model_name="gpt2-large", length=MAX_TEXT_LENGTH
):
    """Computes MAUVE scores between two lists of generated text and reference text.
    Args:
    predictions (list of str) of predictions.
    reference (list of str) of references.
    """
    results = compute_mauve(
        p_text=references,  # human-text.
        q_text=predictions,  # machine-text.
        max_text_length=length,
        featurize_model_name=featurize_model_name,
        verbose=False,
        device_id=0,
        # These are the tricks to make `mauve` run faster if #examples > 5K.
        # See https://github.com/krishnap25/mauve#best-practices-for-mauve
        # num_buckets=500 if len(predictions) > 5000 else "auto",
        # kmeans_num_redo=1,
    )
    return {"mauve": results.mauve}


def distinct_n_grams(texts):
    """Computes the average distinct n-grams of the generated texts.
    Args:
        texts (list of str): representing the generated texts.
    """
    dist_1, dist_2, dist_3, dist_4 = [], [], [], []
    for text in texts:
        total_words = len(text.split())
        unigrams = set(ngrams(text.split(), 1))
        bigrams = set(ngrams(text.split(), 2))
        trigrams = set(ngrams(text.split(), 3))
        fourgrams = set(ngrams(text.split(), 4))
        if total_words == 0:
            dist_1.append(0)
            dist_2.append(0)
            dist_3.append(0)
            dist_4.append(0)
        else:
            dist_1.append(len(unigrams) / total_words)
            dist_2.append(len(bigrams) / total_words)
            dist_3.append(len(trigrams) / total_words)
            dist_4.append(len(fourgrams) / total_words)
    return {
        "dist-1": np.nanmean(dist_1),
        "dist-2": np.nanmean(dist_2),
        "dist-3": np.nanmean(dist_3),
        "dist-4": np.nanmean(dist_4),
    }


def zipf(tokenized_texts, N=5000):
    """Computes the Zipf coefficient.

    Args:
        tokenized_texts (List[List[int]]) tokenized texts.
    Adapted from https://github.com/ari-holtzman/degen/blob/master/metrics/zipf.py
    """
    cnt = Counter()
    for tokenized_text in tokenized_texts:
        cnt.update(tokenized_text)

    xs = np.arange(1, min(len(cnt), N) + 1)
    ys = np.array(sorted(cnt.values(), key=operator.neg)[:N])
    a, b, r, p, std = stats.linregress(np.log(xs), np.log(ys))
    # Note that zipf_minus_a is the reported number.
    return {"zipf_minus_a": -a, "zipf_minus_r": -r, "zipf_p": p}


def accuracy(predictions, targets) -> dict:
    """Computes the average accuracy."""
    return {"accuracy": 100 * ((np.array(predictions) == np.array(targets)).mean())}


def pearson_corrcoef(predictions, targets) -> dict:
    """Computes Pearson correlation coefficient."""
    pearson_corrcoef = 100 * scipy.stats.pearsonr(targets, predictions)[0]

    # If all the predictions are the same, the Pearson correlation is nan;
    # to guard against this, we check the output and return 0 in this case.
    if math.isnan(pearson_corrcoef):
        pearson_corrcoef = 0
    return {"pearson": pearson_corrcoef}


def spearman_corrcoef(predictions, targets) -> dict:
    """Computes Spearman correlation coefficient."""
    spearman_corrcoef = 100 * scipy.stats.spearmanr(targets, predictions)[0]

    # If all the predictions are the same, the Spearman correlation is nan;
    # to guard against this, we check the output and return 0 in this case.
    if math.isnan(spearman_corrcoef):
        spearman_corrcoef = 0
    return {"spearmanr": spearman_corrcoef}


def f1_score_with_invalid(predictions, targets) -> dict:
    """Computes F1 score,  with any prediction != 0 or 1 is counted as incorrect.
    Args:
      targets: list of targets, either 0 or 1
      predictions: list of predictions, any integer value
    Returns:
      F1 score, where any prediction != 0 or 1 is counted as wrong.
    """
    targets, predictions = np.asarray(targets), np.asarray(predictions)
    # Get indices of invalid predictions.
    invalid_idx_mask = np.logical_and(predictions != 0, predictions != 1)
    # For any prediction != 0 or 1, we set the prediction to the opposite of its corresponding target.
    predictions[invalid_idx_mask] = 1 - targets[invalid_idx_mask]
    return {"f1": 100 * sklearn.metrics.f1_score(targets, predictions)}


# TODO: maybe guard against invalid values https://stackoverflow.com/questions/56865344/how-do-i-calculate-the-matthews-correlation-coefficient-in-tensorflow
def matthews_corrcoef(predictions, targets) -> dict:
    """Computes the Matthews correlation coefficient."""
    return {
        "matthews_correlation": 100
        * sklearn.metrics.matthews_corrcoef(targets, predictions)
    }


def rouge(predictions, targets) -> dict:
    """Computes the ROUGE score."""
    scores = [
        default_rouge_scorer.score(prediction=p, target=t)
        for p, t in zip(predictions, targets)
    ]
    average_scores = {k: np.mean([score[k] for score in scores]) for k in scores[0]}
    return average_scores


def get_glue_metrics(task):
    """Returns the list of metric functions for the given GLUE (or SNI) task."""
    GLUE_TASKS_TO_METRICS = {
        "mrpc": [f1_score_with_invalid, accuracy],
        "cola": [matthews_corrcoef],
        "sst2": [accuracy],
        "stsb": [pearson_corrcoef, spearman_corrcoef],
        "qqp": [f1_score_with_invalid, accuracy],
        "mnli": [accuracy],
        "qnli": [accuracy],
        "rte": [accuracy],
        "wnli": [accuracy],
        "sni": [rouge],
    }
    return GLUE_TASKS_TO_METRICS[task]
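

if __name__ == "__main__":
    # Minimal smoke-test sketch (illustrative only): runs a few of the metrics
    # above on toy inputs. Real evaluations would pass model generations and
    # references; `mauve` is skipped here because it downloads the gpt2-large
    # featurizer and expects a GPU at device_id=0.
    toy_texts = ["the cat sat on the mat", "the cat sat on the cat"]
    print(distinct_n_grams(toy_texts))
    print(zipf([[1, 2, 2, 3, 3, 3], [1, 1, 4]]))
    print(accuracy(predictions=[1, 0, 1], targets=[1, 1, 1]))
    print(rouge(predictions=["the cat sat"], targets=["the cat sat on the mat"]))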