|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""BLEU score implementation."""
|
|
|
|
import math
|
|
import sys
|
|
from fractions import Fraction
|
|
import warnings
|
|
from collections import Counter
|
|
|
|
from utils import ngrams
|
|
import pdb
|
|
|
|
|
|
def sentence_bleu(
    references,
    hypothesis,
    weights=(0.25, 0.25, 0.25, 0.25),
    smoothing_function=None,
    auto_reweigh=False,
):
    """
    Compute the sentence-level BLEU score (Bilingual Evaluation Understudy).

    Reference: Papineni, Kishore, Salim Roukos, Todd Ward, and Wei-Jing Zhu.
    2002. "BLEU: a method for automatic evaluation of machine translation."
    In Proceedings of ACL. http://www.aclweb.org/anthology/P02-1040.pdf

    The sentence is scored by treating it as a one-sentence corpus and
    delegating to ``corpus_bleu``.

    >>> hypothesis1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which',
    ...               'ensures', 'that', 'the', 'military', 'always',
    ...               'obeys', 'the', 'commands', 'of', 'the', 'party']

    >>> hypothesis2 = ['It', 'is', 'to', 'insure', 'the', 'troops',
    ...               'forever', 'hearing', 'the', 'activity', 'guidebook',
    ...               'that', 'party', 'direct']

    >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that',
    ...               'ensures', 'that', 'the', 'military', 'will', 'forever',
    ...               'heed', 'Party', 'commands']

    >>> reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which',
    ...               'guarantees', 'the', 'military', 'forces', 'always',
    ...               'being', 'under', 'the', 'command', 'of', 'the',
    ...               'Party']

    >>> reference3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the',
    ...               'army', 'always', 'to', 'heed', 'the', 'directions',
    ...               'of', 'the', 'party']

    >>> sentence_bleu([reference1, reference2, reference3], hypothesis1) # doctest: +ELLIPSIS
    0.5045...

    A precision of 0 for any n-gram order would drive the geometric mean,
    and therefore the whole score, to 0 regardless of the other orders.
    ``hypothesis2`` has no 3-gram or 4-gram overlap with the references.
    When ``smoothing_function`` is omitted, ``corpus_bleu`` falls back to
    ``SmoothingFunction().method1``, so the two calls below are equivalent:

    >>> sentence_bleu([reference1, reference2, reference3], hypothesis2) # doctest: +ELLIPSIS
    0.0370...

    >>> chencherry = SmoothingFunction()
    >>> sentence_bleu([reference1, reference2, reference3], hypothesis2,
    ...               smoothing_function=chencherry.method1) # doctest: +ELLIPSIS
    0.0370...

    The default weights score up to 4-grams uniformly (BLEU-4). Pass custom
    weights to use a different maximum order, e.g. uniform BLEU-5:

    >>> weights = (1./5., 1./5., 1./5., 1./5., 1./5.)
    >>> sentence_bleu([reference1, reference2, reference3], hypothesis1, weights) # doctest: +ELLIPSIS
    0.3920...

    :param references: reference sentences
    :type references: list(list(str))
    :param hypothesis: a hypothesis sentence
    :type hypothesis: list(str)
    :param weights: weights for unigrams, bigrams, trigrams and so on
    :type weights: list(float)
    :param smoothing_function: callable used to smooth the n-gram
        precisions; forwarded unchanged to ``corpus_bleu``
    :type smoothing_function: SmoothingFunction
    :param auto_reweigh: Option to re-normalize the weights uniformly.
    :type auto_reweigh: bool
    :return: The sentence-level BLEU score.
    :rtype: float
    """
    # A single sentence is simply a corpus with exactly one entry.
    single_sentence_references = [references]
    single_sentence_hypotheses = [hypothesis]
    return corpus_bleu(
        single_sentence_references,
        single_sentence_hypotheses,
        weights=weights,
        smoothing_function=smoothing_function,
        auto_reweigh=auto_reweigh,
    )
|
|
|
|
|
|
class _UnnormalizedFraction(Fraction):
    """
    A ``Fraction`` whose ``numerator`` / ``denominator`` are reported exactly
    as given, without reduction to lowest terms.

    Corpus-level BLEU sums the raw clipped counts and raw n-gram totals over
    all sentences, so e.g. 2/4 must not silently become 1/2. The private
    ``_normalize=False`` constructor argument previously used for this was
    removed in Python 3.12; this subclass provides the same observable
    behaviour using public API only. Arithmetic, comparison and ``float()``
    keep working through the base class and yield the same value.
    """

    __slots__ = ("_raw_numerator", "_raw_denominator")

    def __new__(cls, numerator=0, denominator=1):
        # The base class validates the arguments (e.g. zero denominator) and
        # stores the reduced value used for arithmetic.
        self = super().__new__(cls, numerator, denominator)
        self._raw_numerator = numerator
        self._raw_denominator = denominator
        return self

    @property
    def numerator(self):
        return self._raw_numerator

    @property
    def denominator(self):
        return self._raw_denominator


def corpus_bleu(
    list_of_references,
    hypotheses,
    weights=(0.25, 0.25, 0.25, 0.25),
    smoothing_function=None,
    auto_reweigh=False,
):
    """
    Calculate a single corpus-level BLEU score (aka. system-level BLEU) for all
    the hypotheses and their respective references.

    Instead of averaging the sentence level BLEU scores (i.e. macro-average
    precision), the original BLEU metric (Papineni et al. 2002) accounts for
    the micro-average precision (i.e. summing the numerators and denominators
    for each hypothesis-reference(s) pairs before the division).

    >>> hyp1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which',
    ...         'ensures', 'that', 'the', 'military', 'always',
    ...         'obeys', 'the', 'commands', 'of', 'the', 'party']
    >>> ref1a = ['It', 'is', 'a', 'guide', 'to', 'action', 'that',
    ...          'ensures', 'that', 'the', 'military', 'will', 'forever',
    ...          'heed', 'Party', 'commands']
    >>> ref1b = ['It', 'is', 'the', 'guiding', 'principle', 'which',
    ...          'guarantees', 'the', 'military', 'forces', 'always',
    ...          'being', 'under', 'the', 'command', 'of', 'the', 'Party']
    >>> ref1c = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the',
    ...          'army', 'always', 'to', 'heed', 'the', 'directions',
    ...          'of', 'the', 'party']

    >>> hyp2 = ['he', 'read', 'the', 'book', 'because', 'he', 'was',
    ...         'interested', 'in', 'world', 'history']
    >>> ref2a = ['he', 'was', 'interested', 'in', 'world', 'history',
    ...          'because', 'he', 'read', 'the', 'book']

    >>> list_of_references = [[ref1a, ref1b, ref1c], [ref2a]]
    >>> hypotheses = [hyp1, hyp2]
    >>> corpus_bleu(list_of_references, hypotheses) # doctest: +ELLIPSIS
    0.5920...

    The example below shows that corpus_bleu() is different from averaging
    sentence_bleu() for hypotheses

    >>> score1 = sentence_bleu([ref1a, ref1b, ref1c], hyp1)
    >>> score2 = sentence_bleu([ref2a], hyp2)
    >>> (score1 + score2) / 2 # doctest: +ELLIPSIS
    0.6223...

    The score is 0 when there is no unigram overlap at all (including an
    empty corpus), or when a precision with a non-zero weight is still 0
    after smoothing.

    :param list_of_references: a corpus of lists of reference sentences, w.r.t. hypotheses
    :type list_of_references: list(list(list(str)))
    :param hypotheses: a list of hypothesis sentences
    :type hypotheses: list(list(str))
    :param weights: weights for unigrams, bigrams, trigrams and so on
    :type weights: list(float)
    :param smoothing_function: callable applied to the list of n-gram
        precisions; when omitted, ``SmoothingFunction().method1`` is used
    :type smoothing_function: SmoothingFunction
    :param auto_reweigh: Option to re-normalize the weights uniformly.
    :type auto_reweigh: bool
    :return: The corpus-level BLEU score.
    :rtype: float
    :raises AssertionError: if the number of hypotheses differs from the
        number of reference lists
    """
    assert len(list_of_references) == len(hypotheses), (
        "The number of hypotheses and their reference(s) should be the " "same "
    )

    # Key = n-gram order; value = summed clipped matches / summed candidates.
    p_numerators = Counter()
    p_denominators = Counter()
    hyp_lengths, ref_lengths = 0, 0

    for references, hypothesis in zip(list_of_references, hypotheses):
        # Accumulate raw counts per order so the precision is micro-averaged.
        for i, _ in enumerate(weights, start=1):
            p_i = modified_precision(references, hypothesis, i)
            p_numerators[i] += p_i.numerator
            p_denominators[i] += p_i.denominator

        # Lengths feed the corpus-level brevity penalty.
        hyp_len = len(hypothesis)
        hyp_lengths += hyp_len
        ref_lengths += closest_ref_length(references, hyp_len)

    # No unigram match means no higher-order match either. Checking this
    # first also keeps an empty corpus from dividing by zero further down.
    if p_numerators[1] == 0:
        return 0

    bp = brevity_penalty(ref_lengths, hyp_lengths)

    # With fewer than 4 hypothesis tokens, 4-gram precision is meaningless;
    # spread the default weights uniformly over the orders that can exist.
    if auto_reweigh:
        if hyp_lengths < 4 and weights == (0.25, 0.25, 0.25, 0.25):
            weights = (1 / hyp_lengths,) * hyp_lengths

    # Keep the counts un-reduced: smoothing functions read them directly.
    p_n = [
        _UnnormalizedFraction(p_numerators[i], p_denominators[i])
        for i, _ in enumerate(weights, start=1)
    ]

    if not smoothing_function:
        smoothing_function = SmoothingFunction().method1

    # NOTE: `references` and `hypothesis` are the loop variables left over
    # from the last corpus entry; only `hyp_len` is corpus-wide.
    p_n = smoothing_function(
        p_n, references=references, hypothesis=hypothesis, hyp_len=hyp_lengths
    )

    weighted_logs = []
    for w_i, p_i in zip(weights, p_n):
        if not w_i:
            # A zero weight contributes nothing to the sum.
            continue
        if p_i <= 0:
            # A zero factor makes the geometric mean zero; log() would raise.
            return 0
        weighted_logs.append(w_i * math.log(p_i))
    return bp * math.exp(math.fsum(weighted_logs))


def modified_precision(references, hypothesis, n):
    """
    Calculate modified ngram precision.

    The normal precision method may lead to some wrong translations with
    high-precision, e.g., the translation, in which a word of reference
    repeats several times, has very high precision.

    This function only returns the Fraction object that contains the numerator
    and denominator necessary to calculate the corpus-level precision.
    To calculate the modified precision for a single pair of hypothesis and
    references, cast the Fraction object into a float.

    The famous "the the the ... " example shows that you can get BLEU precision
    by duplicating high frequency words.

    >>> reference1 = 'the cat is on the mat'.split()
    >>> reference2 = 'there is a cat on the mat'.split()
    >>> hypothesis1 = 'the the the the the the the'.split()
    >>> references = [reference1, reference2]
    >>> float(modified_precision(references, hypothesis1, n=1)) # doctest: +ELLIPSIS
    0.2857...

    In the modified n-gram precision, a reference word will be considered
    exhausted after a matching hypothesis word is identified, e.g.

    >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that',
    ...               'ensures', 'that', 'the', 'military', 'will',
    ...               'forever', 'heed', 'Party', 'commands']
    >>> reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which',
    ...               'guarantees', 'the', 'military', 'forces', 'always',
    ...               'being', 'under', 'the', 'command', 'of', 'the',
    ...               'Party']
    >>> reference3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the',
    ...               'army', 'always', 'to', 'heed', 'the', 'directions',
    ...               'of', 'the', 'party']
    >>> hypothesis = 'of the'.split()
    >>> references = [reference1, reference2, reference3]
    >>> float(modified_precision(references, hypothesis, n=1))
    1.0
    >>> float(modified_precision(references, hypothesis, n=2))
    1.0

    An example of a normal machine translation hypothesis:

    >>> hypothesis1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which',
    ...               'ensures', 'that', 'the', 'military', 'always',
    ...               'obeys', 'the', 'commands', 'of', 'the', 'party']

    >>> hypothesis2 = ['It', 'is', 'to', 'insure', 'the', 'troops',
    ...               'forever', 'hearing', 'the', 'activity', 'guidebook',
    ...               'that', 'party', 'direct']

    >>> references = [reference1, reference2, reference3]
    >>> float(modified_precision(references, hypothesis1, n=1)) # doctest: +ELLIPSIS
    0.9444...
    >>> float(modified_precision(references, hypothesis2, n=1)) # doctest: +ELLIPSIS
    0.5714...
    >>> float(modified_precision(references, hypothesis1, n=2)) # doctest: +ELLIPSIS
    0.5882352941176471
    >>> float(modified_precision(references, hypothesis2, n=2)) # doctest: +ELLIPSIS
    0.07692...

    :param references: A list of reference translations.
    :type references: list(list(str))
    :param hypothesis: A hypothesis translation.
    :type hypothesis: list(str)
    :param n: The ngram order.
    :type n: int
    :return: BLEU's modified precision for the nth order ngram; its
        ``numerator`` and ``denominator`` are the raw, un-reduced counts.
    :rtype: Fraction
    """
    # A hypothesis shorter than n has no n-grams of that order.
    counts = Counter(ngrams(hypothesis, n)) if len(hypothesis) >= n else Counter()

    # For every hypothesis n-gram, the largest count seen in any single
    # reference; this is the ceiling its own count gets clipped to.
    max_counts = {}
    for reference in references:
        reference_counts = (
            Counter(ngrams(reference, n)) if len(reference) >= n else Counter()
        )
        for ngram in counts:
            max_counts[ngram] = max(max_counts.get(ngram, 0), reference_counts[ngram])

    # `.get(..., 0)`: with no references at all nothing can match.
    clipped_counts = {
        ngram: min(count, max_counts.get(ngram, 0)) for ngram, count in counts.items()
    }

    numerator = sum(clipped_counts.values())
    # Floor the denominator at 1 so an n-gram-less hypothesis yields 0/1
    # instead of a division by zero.
    denominator = max(1, sum(counts.values()))

    return _UnnormalizedFraction(numerator, denominator)
|
|
|
|
|
|
def closest_ref_length(references, hyp_len):
    """
    Return the length of the reference whose length is nearest to that of
    the hypothesis. This is the *r* variable of the brevity penalty formula
    in Papineni et. al. (2002).

    When two references are equally distant from the hypothesis length, the
    shorter one is chosen.

    :param references: A list of reference translations.
    :type references: list(list(str))
    :param hyp_len: The length of the hypothesis.
    :type hyp_len: int
    :return: The length of the reference that's closest to the hypothesis.
    :rtype: int
    """
    # Rank candidates by (distance, length): the smallest tuple is the
    # nearest reference, with ties going to the shorter one.
    ranked = (
        (abs(len(reference) - hyp_len), len(reference)) for reference in references
    )
    _, best_length = min(ranked)
    return best_length
|
|
|
|
|
|
def brevity_penalty(closest_ref_len, hyp_len):
    """
    Calculate brevity penalty.

    Modified n-gram precision alone still favours very short output, so the
    overall BLEU score is scaled by a penalty that depends on length.

    An example from the paper. There are three references with length 12, 15
    and 17. And a concise hypothesis of the length 12. The brevity penalty is 1.

    >>> reference1 = list('aaaaaaaaaaaa')      # i.e. ['a'] * 12
    >>> reference2 = list('aaaaaaaaaaaaaaa')   # i.e. ['a'] * 15
    >>> reference3 = list('aaaaaaaaaaaaaaaaa') # i.e. ['a'] * 17
    >>> hypothesis = list('aaaaaaaaaaaa')      # i.e. ['a'] * 12
    >>> references = [reference1, reference2, reference3]
    >>> hyp_len = len(hypothesis)
    >>> closest_ref_len = closest_ref_length(references, hyp_len)
    >>> brevity_penalty(closest_ref_len, hyp_len)
    1.0

    In case a hypothesis translation is shorter than the references, penalty is
    applied.

    >>> references = [['a'] * 28, ['a'] * 28]
    >>> hypothesis = ['a'] * 12
    >>> hyp_len = len(hypothesis)
    >>> closest_ref_len = closest_ref_length(references, hyp_len)
    >>> brevity_penalty(closest_ref_len, hyp_len)
    0.2635971381157267

    The length of the closest reference is used to compute the penalty. If the
    length of a hypothesis is 12, and the reference lengths are 13 and 2, the
    penalty is applied because the hypothesis length (12) is less than the
    closest reference length (13).

    >>> references = [['a'] * 13, ['a'] * 2]
    >>> hypothesis = ['a'] * 12
    >>> hyp_len = len(hypothesis)
    >>> closest_ref_len = closest_ref_length(references, hyp_len)
    >>> brevity_penalty(closest_ref_len, hyp_len) # doctest: +ELLIPSIS
    0.9200...

    The brevity penalty doesn't depend on reference order. More importantly,
    when two reference sentences are at the same distance, the shortest
    reference sentence length is used.

    >>> references = [['a'] * 13, ['a'] * 11]
    >>> hypothesis = ['a'] * 12
    >>> hyp_len = len(hypothesis)
    >>> closest_ref_len = closest_ref_length(references, hyp_len)
    >>> bp1 = brevity_penalty(closest_ref_len, hyp_len)
    >>> hyp_len = len(hypothesis)
    >>> closest_ref_len = closest_ref_length(reversed(references), hyp_len)
    >>> bp2 = brevity_penalty(closest_ref_len, hyp_len)
    >>> bp1 == bp2 == 1
    True

    A test example from mteval-v13a.pl (starting from the line 705):

    >>> references = [['a'] * 11, ['a'] * 8]
    >>> hypothesis = ['a'] * 7
    >>> hyp_len = len(hypothesis)
    >>> closest_ref_len = closest_ref_length(references, hyp_len)
    >>> brevity_penalty(closest_ref_len, hyp_len) # doctest: +ELLIPSIS
    0.8668...

    >>> references = [['a'] * 11, ['a'] * 8, ['a'] * 6, ['a'] * 7]
    >>> hypothesis = ['a'] * 7
    >>> hyp_len = len(hypothesis)
    >>> closest_ref_len = closest_ref_length(references, hyp_len)
    >>> brevity_penalty(closest_ref_len, hyp_len)
    1.0

    :param hyp_len: The length of the hypothesis for a single sentence OR the
        sum of all the hypotheses' lengths for a corpus
    :type hyp_len: int
    :param closest_ref_len: The length of the closest reference for a single
        hypothesis OR the sum of all the closest references for every hypotheses.
    :type closest_ref_len: int
    :return: BLEU's brevity penalty.
    :rtype: float
    """
    # A hypothesis longer than its closest reference is never penalised.
    if hyp_len > closest_ref_len:
        return 1

    # An empty hypothesis gets the maximum penalty; this guard also keeps
    # the ratio below from dividing by zero.
    if hyp_len == 0:
        return 0

    length_ratio = closest_ref_len / hyp_len
    return math.exp(1 - length_ratio)
|
|
|
|
|
|
class SmoothingFunction:
    """
    This is an implementation of the smoothing techniques
    for segment-level BLEU scores that was presented in
    Boxing Chen and Collin Cherry (2014) A Systematic Comparison of
    Smoothing Techniques for Sentence-Level BLEU. In WMT14.
    http://acl2014.org/acl2014/W14-33/pdf/W14-3346.pdf

    Every ``methodN`` takes the list of n-gram precisions ``p_n`` (objects
    exposing ``numerator`` and ``denominator``) and returns the smoothed
    list. Methods 3 to 7 modify the list they are given in place and return
    it; methods 0 to 2 build a new list.
    """

    def __init__(self, epsilon=0.1, alpha=5, k=5):
        """
        This will initialize the parameters required for the various smoothing
        techniques, the default values are set to the numbers used in the
        experiments from Chen and Cherry (2014).

        >>> hypothesis1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which', 'ensures',
        ...                 'that', 'the', 'military', 'always', 'obeys', 'the',
        ...                 'commands', 'of', 'the', 'party']
        >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that', 'ensures',
        ...               'that', 'the', 'military', 'will', 'forever', 'heed',
        ...               'Party', 'commands']

        >>> chencherry = SmoothingFunction()
        >>> print(sentence_bleu([reference1], hypothesis1)) # doctest: +ELLIPSIS
        0.4118...
        >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method0)) # doctest: +ELLIPSIS
        0.4118...
        >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method1)) # doctest: +ELLIPSIS
        0.4118...
        >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method2)) # doctest: +ELLIPSIS
        0.4489...
        >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method3)) # doctest: +ELLIPSIS
        0.4118...
        >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method4)) # doctest: +ELLIPSIS
        0.4118...
        >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method5)) # doctest: +ELLIPSIS
        0.4905...
        >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method6)) # doctest: +ELLIPSIS
        0.4135...
        >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method7)) # doctest: +ELLIPSIS
        0.4905...

        :param epsilon: the epsilon value use in method 1
        :type epsilon: float
        :param alpha: the alpha value use in method 6
        :type alpha: int
        :param k: the k value use in method 4
        :type k: int
        """
        self.epsilon = epsilon
        self.alpha = alpha
        self.k = k

    def method0(self, p_n, *args, **kwargs):
        """
        No smoothing.

        Non-zero precisions are passed through. For each zero precision a
        warning is issued and the entry is replaced by the smallest positive
        float, so that a later logarithm is defined while the resulting
        score is still practically 0.
        """
        p_n_new = []
        for i, p_i in enumerate(p_n):
            if p_i.numerator != 0:
                p_n_new.append(p_i)
            else:
                _msg = str(
                    "\nThe hypothesis contains 0 counts of {}-gram overlaps.\n"
                    "Therefore the BLEU score evaluates to 0, independently of\n"
                    "how many N-gram overlaps of lower order it contains.\n"
                    "Consider using lower n-gram order or use "
                    "SmoothingFunction()"
                ).format(i + 1)
                warnings.warn(_msg)
                # Stand-in for 0 that math.log() can still accept.
                p_n_new.append(sys.float_info.min)
        return p_n_new

    def method1(self, p_n, *args, **kwargs):
        """
        Smoothing method 1: Add *epsilon* counts to precision with 0 counts.

        Zero entries become the float ``epsilon / denominator``; all other
        entries are returned unchanged.
        """
        return [
            (p_i.numerator + self.epsilon) / p_i.denominator
            if p_i.numerator == 0
            else p_i
            for p_i in p_n
        ]

    def method2(self, p_n, *args, **kwargs):
        """
        Smoothing method 2: Add 1 to both numerator and denominator from
        Chin-Yew Lin and Franz Josef Och (2004) Automatic evaluation of
        machine translation quality using longest common subsequence and
        skip-bigram statistics. In ACL04.

        The add-one is applied to every entry of ``p_n``. The returned
        fractions carry the correct value but may be reduced to lowest terms.
        """
        # The private `_normalize=False` constructor argument no longer
        # exists on Python 3.12+. Only the value of the result is used
        # downstream, so a regular Fraction is sufficient here.
        return [Fraction(p_i.numerator + 1, p_i.denominator + 1) for p_i in p_n]

    def method3(self, p_n, *args, **kwargs):
        """
        Smoothing method 3: NIST geometric sequence smoothing
        The smoothing is computed by taking 1 / ( 2^k ), instead of 0, for each
        precision score whose matching n-gram count is null.
        k is 1 for the first 'n' value for which the n-gram match count is null.

        For example, if the text contains:

        - one 2-gram match
        - and (consequently) two 1-gram matches

        the n-gram count for each individual precision score would be:

        - n=1  =>  prec_count = 2     (two unigrams)
        - n=2  =>  prec_count = 1     (one bigram)
        - n=3  =>  prec_count = 1/2   (no trigram,  taking 'smoothed' value of 1 / ( 2^k ), with k=1)
        - n=4  =>  prec_count = 1/4   (no fourgram, taking 'smoothed' value of 1 / ( 2^k ), with k=2)

        ``p_n`` is updated in place and returned.
        """
        incvnt = 1  # The `k` of the formula above; grows with every zero entry.
        for i, p_i in enumerate(p_n):
            if p_i.numerator == 0:
                p_n[i] = 1 / (2 ** incvnt * p_i.denominator)
                incvnt += 1
        return p_n

    def method4(self, p_n, references, hypothesis, hyp_len=None, *args, **kwargs):
        """
        Smoothing method 4:
        Shorter translations may have inflated precision values due to having
        smaller denominators; therefore, we give them proportionally
        smaller smoothed counts. Instead of scaling to 1/(2^k), Chen and Cherry
        suggests dividing by 1/ln(len(T)), where T is the length of the translation.

        Zero entries are left untouched when the hypothesis length is 0 or 1,
        because ln(length) is then undefined or zero. ``p_n`` is updated in
        place and returned.
        """
        hyp_len = hyp_len if hyp_len else len(hypothesis)
        for i, p_i in enumerate(p_n):
            # `hyp_len > 1`: math.log(1) == 0 would divide by zero below.
            if p_i.numerator == 0 and hyp_len > 1:
                # NOTE(review): operator precedence makes this
                # `i + (self.k / ln(hyp_len))`, not `(i + 1) * self.k / ...`.
                # Kept as-is so existing scores do not change; confirm the
                # intended formula against Chen and Cherry (2014).
                incvnt = i + 1 * self.k / math.log(hyp_len)
                p_n[i] = incvnt / p_i.denominator
        return p_n

    def method5(self, p_n, references, hypothesis, hyp_len=None, *args, **kwargs):
        """
        Smoothing method 5:
        The matched counts for similar values of n should be similar. To a
        calculate the n-gram matched count, it averages the n−1, n and n+1 gram
        matched counts.

        ``p_n`` is updated in place and returned. ``hyp_len`` is accepted for
        signature compatibility with the other methods and is not used.
        """
        m = {}
        # The highest order needs a neighbour one order above it, so compute
        # the precision for order len(p_n) + 1 (5 for the default BLEU-4).
        p_n_plus1 = p_n + [modified_precision(references, hypothesis, len(p_n) + 1)]
        # Virtual "order 0" neighbour used when smoothing the unigrams.
        m[-1] = p_n[0] + 1
        for i, p_i in enumerate(p_n):
            # Average of: smoothed lower order, current order, raw next order.
            p_n[i] = (m[i - 1] + p_i + p_n_plus1[i + 1]) / 3
            m[i] = p_n[i]
        return p_n

    def method6(self, p_n, references, hypothesis, hyp_len=None, *args, **kwargs):
        """
        Smoothing method 6:
        Interpolates the maximum likelihood estimate of the precision *p_n* with
        a prior estimate *pi0*. The prior is estimated by assuming that the ratio
        between pn and pn−1 will be the same as that between pn−1 and pn−2; from
        Gao and He (2013) Training MRF-Based Phrase Translation Models using
        Gradient Ascent. In NAACL.

        Unigram and bigram precisions are left unchanged. ``p_n`` is updated
        in place and returned. ``hyp_len`` is accepted for signature
        compatibility with the other methods and is not used.
        """
        # NOTE(review): index 2 is the trigram precision although the message
        # talks about bigrams (index 1); left unchanged, confirm the intent.
        assert p_n[2], "This smoothing method requires non-zero precision for bigrams."
        for i, p_i in enumerate(p_n):
            if i < 2:
                # No two lower orders to extrapolate the prior from.
                continue
            # Prior: assume p_i / p_(i-1) == p_(i-1) / p_(i-2).
            pi0 = 0 if p_n[i - 2] == 0 else p_n[i - 1] ** 2 / p_n[i - 2]
            # Number of matched (i+1)-grams.
            m = p_i.numerator
            # Number of (i+1)-grams in the hypothesis.
            l = sum(1 for _ in ngrams(hypothesis, i + 1))
            p_n[i] = (m + self.alpha * pi0) / (l + self.alpha)
        return p_n

    def method7(self, p_n, references, hypothesis, hyp_len=None, *args, **kwargs):
        """
        Smoothing method 7:
        Interpolates methods 4 and 5.

        Applies ``method4`` and then ``method5`` to the same list, which is
        updated in place and returned.
        """
        hyp_len = hyp_len if hyp_len else len(hypothesis)
        p_n = self.method4(p_n, references, hypothesis, hyp_len)
        p_n = self.method5(p_n, references, hypothesis, hyp_len)
        return p_n
|
|
|