# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import evaluate
import datasets
from collections import Counter
from math import log, exp
from random import seed, randint
from numpy import mean


# BibTeX citation for the GLEU paper
_CITATION = """\
@InProceedings{napoles-EtAl:2015:ACL-IJCNLP,
  author    = {Napoles, Courtney  and  Sakaguchi, Keisuke  and  Post, Matt  and  Tetreault, Joel},
  title     = {Ground Truth for Grammatical Error Correction Metrics},
  booktitle = {Proceedings of the 53rd Annual Meeting of the Association for Computational Linguistics and the 7th International Joint Conference on Natural Language Processing (Volume 2: Short Papers)},
  month     = {July},
  year      = {2015},
  address   = {Beijing, China},
  publisher = {Association for Computational Linguistics},
  pages     = {588--593},
  url       = {http://www.aclweb.org/anthology/P15-2097}
}
"""

# Description of the module
_DESCRIPTION = """\
The GLEU metric can be used for any monolingual "translation" task, i.e. for Grammatical Error Correction and other text re-writing tasks. Like BLEU, it computes n-gram precisions over the reference, but it assigns extra weight to n-grams that have been correctly changed from the source. GLEU rewards corrections while also correctly crediting unchanged source text.
"""


# Description of the arguments the module accepts
_KWARGS_DESCRIPTION = """
Computes the GLEU score of predictions against their references.
Args:
    references: Reference for each prediction. Each reference should be a string with tokens separated by spaces.
        The references currently also serve as the source sentences.
    predictions: List of predictions to score. Each prediction should be a string with tokens separated by spaces.
Returns:
    mean_gleu_score: Average GLEU score over the bootstrap iterations.

Examples:

    >>> gleu_metric = evaluate.load("gleu")
    >>> references=["We may in actual fact be communicating with a hoax Facebook acccount of a cyberfriend , which we assume to be real but in reality , it is a fake account ."]
    >>> results = gleu_metric.compute(references=references, predictions=["We may of actual fact communicating with a hoax Facebook acccount of a cyber friend , which we assumed to be real but in reality , it is a fake account ."])
    >>> print(results)
    {'mean_gleu_score': 0.6}
    
    >>> results = gleu_metric.compute(references=references, predictions=["We may be in actual fact communicating with a hoax Facebook acccount of a cyber friend , we assume to be real but in reality , it is a fake account ."])
    >>> print(results)
    {'mean_gleu_score': 0.62}
    
    >>> results = gleu_metric.compute(references=references, predictions=["We may in actual fact communicating with a hoax Facebook account of a cyber friend , which we assume to be real but in reality , it is a fake accounts ."])
    >>> print(results)
    {'mean_gleu_score': 0.64}
    
"""

class GLEU():
    def __init__(self, order=4) :
        self.order = order
        
    def load_hypothesis_sentence(self,hypothesis) :
        self.hlen = len(hypothesis)
        self.this_h_ngrams = [self.get_ngram_counts(hypothesis,n)
                                for n in range(1,self.order+1) ]
    
    def load_sources(self,source_sents) :
        self.all_s_ngrams = [[self.get_ngram_counts(source_sent.split(),n)
                                for n in range(1,self.order+1) ]
                                    for source_sent in source_sents ]
    
    def load_references(self,ref_sents) :
        self.refs = [ [] for i in range(len(self.all_s_ngrams)) ]
        self.rlens = [ [] for i in range(len(self.all_s_ngrams)) ]
        for i, ref_sent in enumerate(ref_sents) :
            self.refs[i].append(ref_sent.split())
            self.rlens[i].append(len(ref_sent.split()))
    
        # count number of references each n-gram appears in
        self.all_rngrams_freq = [ Counter() for i in range(self.order) ]
    
        self.all_r_ngrams = [ ]
        for refset in self.refs :
            all_ngrams = []
            self.all_r_ngrams.append(all_ngrams)
    
            for n in range(1,self.order+1) :
                ngrams = self.get_ngram_counts(refset[0],n)
                all_ngrams.append(ngrams)
    
                for k in ngrams.keys() :
                    self.all_rngrams_freq[n-1][k]+=1
    
                for ref in refset[1:] :
                    new_ngrams = self.get_ngram_counts(ref,n)
                    for nn in new_ngrams.elements() :
                        if new_ngrams[nn] > ngrams.get(nn,0) :
                            ngrams[nn] = new_ngrams[nn]
    
    def get_ngram_counts(self,sentence,n) :
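        # For example, get_ngram_counts("a b a b".split(), 2) returns
        # Counter({('a', 'b'): 2, ('b', 'a'): 1}).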
        return Counter([tuple(sentence[i:i+n]) for i in range(len(sentence)+1-n)])
    
    # returns ngrams in a but not in b
    def get_ngram_diff(self,a,b) :
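        # For example, with a = Counter({('a',): 2, ('b',): 1}) and b = Counter({('b',): 5}),
        # the result is Counter({('a',): 2}): the shared ('b',) key is removed outright
        # rather than having its count subtracted.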
        diff = Counter(a)
        for k in (set(a) & set(b)) :
            del diff[k]
        return diff
    
    def normalization(self,ngram,n) :
        return 1.0*self.all_rngrams_freq[n-1][ngram]/len(self.rlens[0])
    
    # Collect BLEU-relevant statistics for a single hypothesis/reference pair.
    # Return value is a generator yielding:
    # (c, r, numerator1, denominator1, ... numerator4, denominator4)
    # Summing the columns across calls to this function on an entire corpus
    # will produce a vector of statistics that can be used to compute GLEU
    def gleu_stats(self, i, r_ind=None):

        hlen = self.hlen
        rlen = self.rlens[i][r_ind]

        yield hlen
        yield rlen

        for n in range(1, self.order + 1):
            h_ngrams = self.this_h_ngrams[n - 1]
            s_ngrams = self.all_s_ngrams[i][n - 1]
            r_ngrams = self.get_ngram_counts(self.refs[i][r_ind], n)

            s_ngram_diff = self.get_ngram_diff(s_ngrams, r_ngrams)

            yield max([sum((h_ngrams & r_ngrams).values()) -
                       sum((h_ngrams & s_ngram_diff).values()), 0])

            yield max([hlen + 1 - n, 0])
    
    # Compute GLEU from collected statistics obtained by call(s) to gleu_stats
    def compute_gleu(self,stats,smooth=False):
        # smooth 0 counts for sentence-level scores
        if smooth :
            stats = [ s if s != 0 else 1 for s in stats ]
        if len(list(filter(lambda x: x==0, stats))) > 0:
            return 0
        (c, r) = stats[:2]
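        # NB: the fixed divisor of 4 below assumes the default n-gram order of 4.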
        log_gleu_prec = sum([log(float(x)/y) for x,y in zip(stats[2::2],stats[3::2])]) / 4
        return exp(min([0, 1-float(r)/c]) + log_gleu_prec)
    


@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class gleu(evaluate.Metric):
    """TODO: Short description of my evaluation module."""

    def _info(self):
        # Specifies the evaluate.EvaluationModuleInfo object
        return evaluate.MetricInfo(
            # This is the description that will appear on the modules page.
            module_type="metric",
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            # This defines the format of each prediction and reference
            features=datasets.Features(
                {
                    "predictions": datasets.Value("string", id="sequence"),
                    "references": datasets.Value("string", id="sequence"),
                }
            ),
            codebase_urls=["https://github.com/cnap/gec-ranking/"],
        )

    def _download_and_prepare(self, dl_manager):
        """Optional: download external resources useful to compute the scores"""
        # TODO: Download external resources if needed
        pass

    def _compute(self, references, predictions):
        """Returns the scores"""
        
        num_iterations = 500
        
        order=4
        
        if len(references)==1:
            num_iterations = 1
            
        gleu_calculator = GLEU(order=order)
        
        # A separate ``sources`` argument is not exposed yet, so the references
        # also serve as the source sentences.
        gleu_calculator.load_sources(references)
        gleu_calculator.load_references(references)
        
        # first generate a random list of indices, using a different seed
        # for each iteration
        indices = []
        for j in range(num_iterations) :
            seed(j*101)
            # each prediction has exactly one reference here, so the sampled index is always 0
            indices.append([randint(0, len(gleu_calculator.refs[i]) - 1) for i in range(len(predictions))])
        
        iter_stats = [[0 for i in range(2*order+2)] for j in range(num_iterations) ]
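        # Each row of iter_stats accumulates, over all hypotheses, the columns yielded by
        # gleu_stats: hypothesis length, reference length, then one numerator/denominator
        # pair per n-gram order (2*order + 2 columns in total).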
        
        for i,h in enumerate(predictions) :
            gleu_calculator.load_hypothesis_sentence(h.split())
            
            # we are going to store the score of this sentence for each ref
            # so we don't have to recalculate them 500 times
            
            stats_by_ref = [ None for r in range(len(references)) ]
            
            for j in range(num_iterations) :
                ref = indices[j][i]
                this_stats = stats_by_ref[ref]
            
                if this_stats is None :
                    this_stats = [ s for s in gleu_calculator.gleu_stats(i,r_ind=ref) ]
                    stats_by_ref[ref] = this_stats
                    
                iter_stats[j] = [sum(scores) for scores in zip(iter_stats[j], this_stats)]
        
        # One corpus-level GLEU score per bootstrap iteration, averaged into the final score.
        iter_scores = [gleu_calculator.compute_gleu(stats) for stats in iter_stats]
        mean_score = mean(iter_scores)
        return {"mean_gleu_score": mean_score}