# Natural Language Toolkit: RIBES Score
#
# Copyright (C) 2001-2023 NLTK Project
# Contributors: Katsuhito Sudoh, Liling Tan, Kasramvd, J.F.Sebastian
# Mark Byers, ekhumoro, P. Ortiz
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
""" RIBES score implementation """
import math
from itertools import islice

from nltk.util import choose, ngrams


def sentence_ribes(references, hypothesis, alpha=0.25, beta=0.10):
"""
The RIBES (Rank-based Intuitive Bilingual Evaluation Score) from
Hideki Isozaki, Tsutomu Hirao, Kevin Duh, Katsuhito Sudoh and
Hajime Tsukada. 2010. "Automatic Evaluation of Translation Quality for
Distant Language Pairs". In Proceedings of EMNLP.
https://www.aclweb.org/anthology/D/D10/D10-1092.pdf

    The generic RIBES scores used in shared tasks, e.g. the Workshop on
    Asian Translation (WAT), use the following RIBES calculation:

        RIBES = kendall_tau * (p1**alpha) * (bp**beta)

    Please note that this re-implementation differs from the official
    RIBES implementation and, although it emulates the results described
    in the original paper, there are further optimizations implemented
    in the official RIBES script.

    Users are encouraged to use the official RIBES script instead of this
    implementation when evaluating their machine translation systems. Refer
    to https://www.kecl.ntt.co.jp/icl/lirg/ribes/ for the official script.
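
    A minimal doctest sketch, reusing the single-reference pair from the
    corpus_ribes example below; for this pair the unigram precision *p1*
    and the brevity penalty *bp* are both 1.0, so the score reduces to the
    normalized Kendall's Tau:

    >>> hyp = ['he', 'read', 'the', 'book', 'because', 'he', 'was',
    ...        'interested', 'in', 'world', 'history']
    >>> ref = ['he', 'was', 'interested', 'in', 'world', 'history',
    ...        'because', 'he', 'read', 'the', 'book']
    >>> round(sentence_ribes([ref], hyp), 4)
    0.3818
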
:param references: a list of reference sentences
:type references: list(list(str))
:param hypothesis: a hypothesis sentence
:type hypothesis: list(str)
:param alpha: hyperparameter used as a prior for the unigram precision.
:type alpha: float
:param beta: hyperparameter used as a prior for the brevity penalty.
:type beta: float
:return: The best ribes score from one of the references.
:rtype: float
"""
best_ribes = -1.0
# Calculates RIBES for each reference and returns the best score.
for reference in references:
# Collects the *worder* from the ranked correlation alignments.
worder = word_rank_alignment(reference, hypothesis)
nkt = kendall_tau(worder)
# Calculates the brevity penalty
bp = min(1.0, math.exp(1.0 - len(reference) / len(hypothesis)))
# Calculates the unigram precision, *p1*
p1 = len(worder) / len(hypothesis)
_ribes = nkt * (p1**alpha) * (bp**beta)
if _ribes > best_ribes: # Keeps the best score.
best_ribes = _ribes
return best_ribes


def corpus_ribes(list_of_references, hypotheses, alpha=0.25, beta=0.10):
"""
This function "calculates RIBES for a system output (hypothesis) with
multiple references, and returns "best" score among multi-references and
individual scores. The scores are corpus-wise, i.e., averaged by the number
of sentences." (c.f. RIBES version 1.03.1 code).
    Different from BLEU's micro-average precision, RIBES calculates the
    macro-average precision by averaging the best RIBES score for each pair
    of a hypothesis and its corresponding references.
>>> hyp1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which',
... 'ensures', 'that', 'the', 'military', 'always',
... 'obeys', 'the', 'commands', 'of', 'the', 'party']
>>> ref1a = ['It', 'is', 'a', 'guide', 'to', 'action', 'that',
... 'ensures', 'that', 'the', 'military', 'will', 'forever',
... 'heed', 'Party', 'commands']
>>> ref1b = ['It', 'is', 'the', 'guiding', 'principle', 'which',
... 'guarantees', 'the', 'military', 'forces', 'always',
... 'being', 'under', 'the', 'command', 'of', 'the', 'Party']
>>> ref1c = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the',
... 'army', 'always', 'to', 'heed', 'the', 'directions',
... 'of', 'the', 'party']
>>> hyp2 = ['he', 'read', 'the', 'book', 'because', 'he', 'was',
... 'interested', 'in', 'world', 'history']
>>> ref2a = ['he', 'was', 'interested', 'in', 'world', 'history',
... 'because', 'he', 'read', 'the', 'book']
>>> list_of_references = [[ref1a, ref1b, ref1c], [ref2a]]
>>> hypotheses = [hyp1, hyp2]
>>> round(corpus_ribes(list_of_references, hypotheses),4)
0.3597
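
    Since RIBES is a macro-average, the corpus score above is simply the
    mean of the best per-sentence scores (a small sanity-check sketch that
    reuses the variables defined above):

    >>> round((sentence_ribes([ref1a, ref1b, ref1c], hyp1)
    ...        + sentence_ribes([ref2a], hyp2)) / 2, 4)
    0.3597
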
    :param list_of_references: a corpus of lists of reference sentences, w.r.t. hypotheses
    :type list_of_references: list(list(list(str)))
:param hypotheses: a list of hypothesis sentences
:type hypotheses: list(list(str))
:param alpha: hyperparameter used as a prior for the unigram precision.
:type alpha: float
:param beta: hyperparameter used as a prior for the brevity penalty.
:type beta: float
    :return: The corpus-level RIBES score, i.e. the average of the best
        sentence-level RIBES scores over all hypothesis-references pairs.
:rtype: float
"""
corpus_best_ribes = 0.0
# Iterate through each hypothesis and their corresponding references.
for references, hypothesis in zip(list_of_references, hypotheses):
corpus_best_ribes += sentence_ribes(references, hypothesis, alpha, beta)
return corpus_best_ribes / len(hypotheses)


def position_of_ngram(ngram, sentence):
"""
This function returns the position of the first instance of the ngram
appearing in a sentence.
    Note that one could also use strings as follows, but the code is a little
    convoluted with the type casting back and forth:

        char_pos = ' '.join(sent)[:' '.join(sent).index(' '.join(ngram))]
        word_pos = char_pos.count(' ')

    Another way to conceive this is:

        return next(i for i, ng in enumerate(ngrams(sentence, len(ngram)))
                    if ng == ngram)
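
    A minimal doctest sketch; positions are 0-indexed and the function
    implicitly returns None when the ngram does not occur in the sentence:

    >>> position_of_ngram(('read', 'the', 'book'), 'he read the book'.split())
    1
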
:param ngram: The ngram that needs to be searched
:type ngram: tuple
:param sentence: The list of tokens to search from.
:type sentence: list(str)
"""
# Iterates through the ngrams in sentence.
for i, sublist in enumerate(ngrams(sentence, len(ngram))):
# Returns the index of the word when ngram matches.
if ngram == sublist:
return i


def word_rank_alignment(reference, hypothesis, character_based=False):
"""
This is the word rank alignment algorithm described in the paper to produce
the *worder* list, i.e. a list of word indices of the hypothesis word orders
w.r.t. the list of reference words.
    Below is the (H0, R0) example from the Isozaki et al. 2010 paper; note
    that the examples in the paper are indexed from 1 but the results here
    are indexed from 0:
>>> ref = str('he was interested in world history because he '
... 'read the book').split()
>>> hyp = str('he read the book because he was interested in world '
... 'history').split()
>>> word_rank_alignment(ref, hyp)
[7, 8, 9, 10, 6, 0, 1, 2, 3, 4, 5]
    The (H1, R1) example from the paper; note the 0-based indexing:
>>> ref = 'John hit Bob yesterday'.split()
>>> hyp = 'Bob hit John yesterday'.split()
>>> word_rank_alignment(ref, hyp)
[2, 1, 0, 3]
    Here is the (H2, R2) example from the paper; again note the 0-based indexing:
>>> ref = 'the boy read the book'.split()
>>> hyp = 'the book was read by the boy'.split()
>>> word_rank_alignment(ref, hyp)
[3, 4, 2, 0, 1]
:param reference: a reference sentence
:type reference: list(str)
:param hypothesis: a hypothesis sentence
:type hypothesis: list(str)
"""
worder = []
hyp_len = len(hypothesis)
    # Stores the possible ngrams from the reference and hypothesis sentences.
    # These are used for matching context windows later in the algorithm.
ref_ngrams = []
hyp_ngrams = []
for n in range(1, len(reference) + 1):
for ng in ngrams(reference, n):
ref_ngrams.append(ng)
for ng in ngrams(hypothesis, n):
hyp_ngrams.append(ng)
for i, h_word in enumerate(hypothesis):
# If word is not in the reference, continue.
if h_word not in reference:
continue
# If we can determine one-to-one word correspondence for unigrams that
# only appear once in both the reference and hypothesis.
elif hypothesis.count(h_word) == reference.count(h_word) == 1:
worder.append(reference.index(h_word))
else:
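            # Otherwise the word is ambiguous: grow a context window around
            # it until the containing ngram is unique in both the reference
            # and the hypothesis, then use that ngram's reference position.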
max_window_size = max(i, hyp_len - i + 1)
for window in range(1, max_window_size):
if i + window < hyp_len: # If searching the right context is possible.
# Retrieve the right context window.
right_context_ngram = tuple(islice(hypothesis, i, i + window + 1))
num_times_in_ref = ref_ngrams.count(right_context_ngram)
num_times_in_hyp = hyp_ngrams.count(right_context_ngram)
# If ngram appears only once in both ref and hyp.
if num_times_in_ref == num_times_in_hyp == 1:
# Find the position of ngram that matched the reference.
pos = position_of_ngram(right_context_ngram, reference)
worder.append(pos) # Add the positions of the ngram.
break
if window <= i: # If searching the left context is possible.
# Retrieve the left context window.
left_context_ngram = tuple(islice(hypothesis, i - window, i + 1))
num_times_in_ref = ref_ngrams.count(left_context_ngram)
num_times_in_hyp = hyp_ngrams.count(left_context_ngram)
if num_times_in_ref == num_times_in_hyp == 1:
# Find the position of ngram that matched the reference.
pos = position_of_ngram(left_context_ngram, reference)
# Add the positions of the ngram.
worder.append(pos + len(left_context_ngram) - 1)
break
return worder


def find_increasing_sequences(worder):
"""
Given the *worder* list, this function groups monotonic +1 sequences.
>>> worder = [7, 8, 9, 10, 6, 0, 1, 2, 3, 4, 5]
>>> list(find_increasing_sequences(worder))
[(7, 8, 9, 10), (0, 1, 2, 3, 4, 5)]
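
    A fully inverted ordering contains no such runs (a small sketch):

    >>> list(find_increasing_sequences([3, 2, 1, 0]))
    []
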
:param worder: The worder list output from word_rank_alignment
    :type worder: list(int)
"""
items = iter(worder)
a, b = None, next(items, None)
result = [b]
while b is not None:
a, b = b, next(items, None)
if b is not None and a + 1 == b:
result.append(b)
else:
if len(result) > 1:
yield tuple(result)
result = [b]


def kendall_tau(worder, normalize=True):
"""
    Calculates the Kendall's Tau correlation coefficient given the *worder*
    list of word alignments from word_rank_alignment(), using the formula:

        tau = 2 * num_increasing_pairs / num_possible_pairs - 1

    Note that the increasing sequences in the *worder* list can be
    discontinuous, and each increasing sequence of length len(seq)
    contributes choose(len(seq), 2) increasing pairs, e.g.
>>> worder = [7, 8, 9, 10, 6, 0, 1, 2, 3, 4, 5]
>>> number_possible_pairs = choose(len(worder), 2)
>>> round(kendall_tau(worder, normalize=False),3)
-0.236
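
    The same unnormalized value can be recovered by hand from the two
    increasing runs (7, 8, 9, 10) and (0, 1, 2, 3, 4, 5):

    >>> num_increasing_pairs = choose(4, 2) + choose(6, 2)
    >>> round(2 * num_increasing_pairs / number_possible_pairs - 1, 3)
    -0.236
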
>>> round(kendall_tau(worder),3)
0.382
:param worder: The worder list output from word_rank_alignment
:type worder: list(int)
:param normalize: Flag to indicate normalization to between 0.0 and 1.0.
:type normalize: boolean
:return: The Kendall's Tau correlation coefficient.
:rtype: float
"""
worder_len = len(worder)
# With worder_len < 2, `choose(worder_len, 2)` will be 0.
# As we divide by this, it will give a ZeroDivisionError.
# To avoid this, we can just return the lowest possible score.
if worder_len < 2:
tau = -1
else:
# Extract the groups of increasing/monotonic sequences.
increasing_sequences = find_increasing_sequences(worder)
# Calculate no. of increasing_pairs in *worder* list.
num_increasing_pairs = sum(choose(len(seq), 2) for seq in increasing_sequences)
# Calculate no. of possible pairs.
num_possible_pairs = choose(worder_len, 2)
# Kendall's Tau computation.
tau = 2 * num_increasing_pairs / num_possible_pairs - 1
    if normalize:  # If normalized, the tau output falls between 0.0 and 1.0
        return (tau + 1) / 2
    else:  # Otherwise, the tau output falls between -1.0 and +1.0
        return tau


def spearman_rho(worder, normalize=True):
"""
    Calculates the Spearman's Rho correlation coefficient given the *worder*
    list of word alignments from word_rank_alignment(), using the formula:

        rho = 1 - sum(d**2) / choose(len(worder)+1, 3)

    where d is the difference between each reference index in the *worder*
    list and its position in the list.

    Using the (H0, R0) and (H5, R5) example from the paper:
>>> worder = [7, 8, 9, 10, 6, 0, 1, 2, 3, 4, 5]
>>> round(spearman_rho(worder, normalize=False), 3)
-0.591
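
    The same unnormalized value can be checked directly against the formula
    above:

    >>> d_squares = sum((wi - i) ** 2 for i, wi in enumerate(worder))
    >>> round(1 - d_squares / choose(len(worder) + 1, 3), 3)
    -0.591
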
>>> round(spearman_rho(worder), 3)
0.205
:param worder: The worder list output from word_rank_alignment
    :type worder: list(int)
"""
worder_len = len(worder)
sum_d_square = sum((wi - i) ** 2 for wi, i in zip(worder, range(worder_len)))
rho = 1 - sum_d_square / choose(worder_len + 1, 3)
    if normalize:  # If normalized, the rho output falls between 0.0 and 1.0
        return (rho + 1) / 2
    else:  # Otherwise, the rho output falls between -1.0 and +1.0
return rho