|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
'''Provides:

cook_refs(refs, eff=None, n=4): Transform a list of reference sentences as strings into a form usable by cook_test().

cook_test(test, refparam, eff=None, n=4): Transform a test sentence as a string (together with the cooked reference sentences) into the per-segment statistics consumed by BleuScorer.compute_score().

'''
|
|
|
import copy |
|
import sys, math, re |
|
from collections import defaultdict |
|
|
|
def precook(s, n=4, out=False):
    """Count every n-gram of order 1..n in a whitespace-tokenised sentence.

    Args:
        s: Sentence as a string; it is tokenised with ``str.split()``.
        n: Highest n-gram order to count.
        out: Unused; accepted only so existing callers keep working.

    Returns:
        A ``(token_count, counts)`` tuple where ``counts`` is a
        ``defaultdict(int)`` mapping each n-gram (a tuple of tokens) to the
        number of times it occurs in ``s``.
    """
    tokens = s.split()
    ngram_counts = defaultdict(int)
    for order in range(1, n + 1):
        # Zipping `order` progressively shifted views of the token list
        # yields each contiguous window of that length, left to right.
        shifted_views = [tokens[offset:] for offset in range(order)]
        for ngram in zip(*shifted_views):
            ngram_counts[ngram] += 1
    return (len(tokens), ngram_counts)
|
|
|
def cook_refs(refs, eff=None, n=4):
    """Summarise the reference sentences of one segment.

    Args:
        refs: Iterable of reference sentences (strings) for a single segment.
        eff: How to collapse the reference lengths. ``"shortest"`` keeps the
            minimum, ``"average"`` keeps the mean as a float; any other value
            leaves the full list of lengths untouched.
        n: Highest n-gram order to count.

    Returns:
        A ``(reflen, maxcounts)`` tuple. ``reflen`` is the list of reference
        lengths (or the single collapsed value, see ``eff``) and
        ``maxcounts`` maps each n-gram to the largest count it reaches in any
        one reference.
    """
    lengths = []
    max_counts = {}
    for sentence in refs:
        length, ngram_counts = precook(sentence, n)
        lengths.append(length)
        for ngram, occurrences in ngram_counts.items():
            best_so_far = max_counts.get(ngram, 0)
            max_counts[ngram] = best_so_far if best_so_far >= occurrences else occurrences

    if eff == "shortest":
        return (min(lengths), max_counts)
    if eff == "average":
        return (float(sum(lengths)) / len(lengths), max_counts)
    # Any other setting keeps every length so the caller can pick one later.
    return (lengths, max_counts)
|
|
|
def cook_test(test, refparam, eff=None, n=4):
    """Gather the per-segment statistics of a test sentence.

    Args:
        test: Test (hypothesis) sentence as a string.
        refparam: Cooked references; element 0 is the reference length
            information and element 1 the per-n-gram maximum counts, as
            returned by ``cook_refs``.
        eff: When ``"closest"``, store the single reference length nearest
            to the test length (the shorter one wins a tie); otherwise the
            reference length information is stored unchanged.
        n: Highest n-gram order to evaluate.

    Returns:
        A dict with keys ``"reflen"``, ``"testlen"``, ``"guess"`` (number of
        n-grams in the test sentence, per order) and ``"correct"`` (number of
        those n-grams matched in the references, clipped by the reference
        maximum counts, per order).
    """
    ref_lengths = refparam[0]
    ref_max_counts = refparam[1]
    hyp_length, hyp_counts = precook(test, n, True)

    if eff == "closest":
        # Tuples sort by distance first, then by length.
        chosen_reflen = min(
            (abs(length - hyp_length), length) for length in ref_lengths
        )[1]
    else:
        chosen_reflen = ref_lengths

    clipped_matches = [0] * n
    for ngram, occurrences in hyp_counts.items():
        allowed = ref_max_counts.get(ngram, 0)
        clipped_matches[len(ngram) - 1] += min(allowed, occurrences)

    return {
        "reflen": chosen_reflen,
        "testlen": hyp_length,
        "guess": [max(0, hyp_length - order + 1) for order in range(1, n + 1)],
        "correct": clipped_matches,
    }
|
|
|
class BleuScorer(object):
    """Accumulate cooked test/reference segments and compute BLEU over them.

    Attributes:
        n: Highest n-gram order that is scored.
        crefs: One ``cook_refs`` result per segment.
        ctest: One ``cook_test`` result per segment, or ``None`` for a
            segment that was added with references but no test sentence.
        special_reflen: When not ``None``, this value is used as the
            reference length of every segment instead of one derived from
            the references.
    """

    # "_bleu_list" caches the per-segment scores so that a cached
    # compute_score() call can return the same shape as a fresh one.
    __slots__ = (
        "n", "crefs", "ctest", "_score", "_ratio", "_testlen", "_reflen",
        "special_reflen", "_bleu_list",
    )

    def copy(self):
        """Return a new scorer with shallow copies of the segment lists.

        The copy keeps ``n`` and ``special_reflen``; its cached score is
        cleared.
        """
        new = BleuScorer(n=self.n, special_reflen=self.special_reflen)
        new.ctest = copy.copy(self.ctest)
        new.crefs = copy.copy(self.crefs)
        new._score = None
        return new

    def __init__(self, test=None, refs=None, n=4, special_reflen=None):
        """Create a scorer, optionally seeded with one segment.

        Args:
            test: Test sentence for the first segment, or ``None``.
            refs: Reference sentences for the first segment, or ``None``
                (in which case nothing is added, even if ``test`` is given).
            n: Highest n-gram order to score.
            special_reflen: Fixed reference length to use for every segment,
                or ``None`` to derive it from the references.
        """
        self.n = n
        self.crefs = []
        self.ctest = []
        self.special_reflen = special_reflen
        self.cook_append(test, refs)

    def cook_append(self, test, refs):
        """Cook and store one segment; used by ``__init__`` and ``__iadd__``.

        Nothing is stored when ``refs`` is ``None``. When ``refs`` is given
        but ``test`` is ``None``, a ``None`` placeholder is stored in
        ``ctest`` so both lists stay the same length.
        """
        if refs is not None:
            # Cook with this scorer's own order so the per-order lists have
            # exactly the length compute_score() iterates over.
            self.crefs.append(cook_refs(refs, n=self.n))
            if test is not None:
                self.ctest.append(cook_test(test, self.crefs[-1], n=self.n))
            else:
                self.ctest.append(None)

        self._score = None  # any cached score is now stale

    def ratio(self, option=None):
        """Return the corpus-level test/reference length ratio."""
        self.compute_score(option=option)
        return self._ratio

    def fscore(self, option=None):
        """Return the corpus-level score of the highest n-gram order.

        NOTE(review): ``score_ratio`` referenced ``fscore`` but the class did
        not define it; returning the order-``n`` corpus score is an
        assumption about the intended meaning - confirm.
        """
        return self.compute_score(option=option)[0][-1]

    def score_ratio(self, option=None):
        """Return a ``(bleu, len_ratio)`` pair."""
        return (self.fscore(option=option), self.ratio(option=option))

    def score_ratio_str(self, option=None):
        """Return ``score_ratio`` formatted as ``"%.4f (%.2f)"``."""
        return "%.4f (%.2f)" % self.score_ratio(option)

    def reflen(self, option=None):
        """Return the total reference length summed over all segments."""
        self.compute_score(option=option)
        return self._reflen

    def testlen(self, option=None):
        """Return the total test length summed over all segments."""
        self.compute_score(option=option)
        return self._testlen

    def retest(self, new_test):
        """Replace every test sentence, keeping the stored references.

        Args:
            new_test: One sentence per stored segment; a single string is
                treated as a one-element list.

        Returns:
            ``self``, so calls can be chained.

        Raises:
            AssertionError: If the number of sentences differs from the
                number of stored reference sets.
        """
        if isinstance(new_test, str):
            new_test = [new_test]
        assert len(new_test) == len(self.crefs), new_test
        self.ctest = [
            cook_test(sentence, cooked_refs, n=self.n)
            for sentence, cooked_refs in zip(new_test, self.crefs)
        ]
        self._score = None

        return self

    def rescore(self, new_test):
        """Replace the test sentence(s) and return the new score."""
        return self.retest(new_test).compute_score()

    def size(self):
        """Return the number of stored segments.

        Raises:
            AssertionError: If ``crefs`` and ``ctest`` differ in length.
        """
        assert len(self.crefs) == len(self.ctest), "refs/test mismatch! %d<>%d" % (len(self.crefs), len(self.ctest))
        return len(self.crefs)

    def __iadd__(self, other):
        """Add a segment, or merge in all segments of another scorer.

        Args:
            other: Either a ``(test, refs)`` tuple, which is cooked and
                appended, or a compatible ``BleuScorer`` whose segments are
                appended as-is.

        Raises:
            AssertionError: If ``other`` is not a tuple and is not
                compatible with this scorer.
        """
        if isinstance(other, tuple):
            # Cook in place rather than building a temporary scorer.
            self.cook_append(other[0], other[1])
        else:
            assert self.compatible(other), "incompatible BLEUs."
            self.ctest.extend(other.ctest)
            self.crefs.extend(other.crefs)
            self._score = None

        return self

    def compatible(self, other):
        """Return True if ``other`` is a ``BleuScorer`` with the same ``n``."""
        return isinstance(other, BleuScorer) and self.n == other.n

    def single_reflen(self, option="average"):
        """Return the effective reference length of the first segment."""
        return self._single_reflen(self.crefs[0][0], option)

    def _single_reflen(self, reflens, option=None, testlen=None):
        """Collapse a list of reference lengths into one effective length.

        Args:
            reflens: Reference lengths of one segment.
            option: ``"shortest"`` (minimum), ``"average"`` (mean as float)
                or ``"closest"`` (length nearest to ``testlen``; the shorter
                one wins a tie).
            testlen: Test length; only used by ``"closest"``.

        Raises:
            AssertionError: For any other ``option``.
        """
        if option == "shortest":
            return min(reflens)
        if option == "average":
            return float(sum(reflens)) / len(reflens)
        if option == "closest":
            return min((abs(l - testlen), l) for l in reflens)[1]
        # Raised explicitly (not via `assert`) so the check survives -O.
        raise AssertionError("unsupported reflen option %s" % option)

    def recompute_score(self, option=None, verbose=0):
        """Discard the cached score and compute it again."""
        self._score = None
        return self.compute_score(option, verbose)

    def compute_score(self, option=None, verbose=0):
        """Compute corpus-level and per-segment scores for orders 1..n.

        For each order ``k`` the score is the geometric mean of the clipped
        n-gram precisions of orders 1..k, multiplied by
        ``exp(1 - 1/ratio)`` when the test/reference length ratio is below 1.

        The result is cached. While a cached result exists, ``option`` and
        ``verbose`` are ignored; use ``recompute_score`` to force a fresh
        computation. Every entry of ``ctest`` is indexed as a dict here, so
        segments stored without a test sentence (``None``) must be filled in
        (e.g. via ``retest``) first.

        Args:
            option: Reference-length policy passed to ``_single_reflen``.
                ``None`` selects ``"average"`` when exactly one segment is
                stored and ``"closest"`` otherwise.
            verbose: Above 0 prints the corpus totals and ratio; above 1
                also prints each segment's statistics.

        Returns:
            A ``(bleus, bleu_list)`` tuple: ``bleus[k]`` is the corpus-level
            score for order ``k + 1`` and ``bleu_list[k]`` holds the
            per-segment scores for that order.
        """
        if self._score is not None:
            # Same shape as a fresh computation.
            return self._score, self._bleu_list

        n = self.n
        # Small offsets keep the divisions below defined when a count is 0.
        small = 1e-9
        tiny = 1e-15
        bleu_list = [[] for _ in range(n)]

        if option is None:
            option = "average" if len(self.crefs) == 1 else "closest"

        self._testlen = 0
        self._reflen = 0
        totalcomps = {'testlen': 0, 'reflen': 0, 'guess': [0] * n, 'correct': [0] * n}

        for comps in self.ctest:
            testlen = comps['testlen']
            self._testlen += testlen

            if self.special_reflen is None:
                reflen = self._single_reflen(comps['reflen'], option, testlen)
            else:
                reflen = self.special_reflen

            self._reflen += reflen

            for key in ['guess', 'correct']:
                for k in range(n):
                    totalcomps[key][k] += comps[key][k]

            # Per-segment score for every order.
            bleu = 1.
            for k in range(n):
                bleu *= (float(comps['correct'][k]) + tiny) \
                    / (float(comps['guess'][k]) + small)
                bleu_list[k].append(bleu ** (1. / (k + 1)))
            ratio = (testlen + tiny) / (reflen + small)
            if ratio < 1:
                for k in range(n):
                    bleu_list[k][-1] *= math.exp(1 - 1 / ratio)

            if verbose > 1:
                print(comps, reflen)

        totalcomps['reflen'] = self._reflen
        totalcomps['testlen'] = self._testlen

        # Corpus-level score from the summed counts.
        bleus = []
        bleu = 1.
        for k in range(n):
            bleu *= float(totalcomps['correct'][k] + tiny) \
                / (totalcomps['guess'][k] + small)
            bleus.append(bleu ** (1. / (k + 1)))
        ratio = (self._testlen + tiny) / (self._reflen + small)
        if ratio < 1:
            for k in range(n):
                bleus[k] *= math.exp(1 - 1 / ratio)

        if verbose > 0:
            print(totalcomps)
            print("ratio:", ratio)

        self._ratio = ratio
        self._bleu_list = bleu_list
        self._score = bleus
        return self._score, self._bleu_list
|
|