File size: 6,299 Bytes
158b61b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 |
#!/usr/bin/env python
# Compute BLEU scores with confidence intervals via bootstrap resampling.
# written by Ulrich Germann
#
# This file is part of moses. Its use is licensed under the GNU Lesser General
# Public License version 2.1 or, at your option, any later version.
from argparse import ArgumentParser
import math
import os
from random import randint
import sys, gzip
def count_ngrams(snt, max_n):
    """
    Return a dictionary of ngram counts (up to length /max_n/)
    for sentence (list of words) /snt/.

    Keys are tuples of consecutive tokens taken from /snt/; each value is
    the number of times that ngram occurs in the sentence. An empty
    sentence yields an empty dictionary.
    """
    ret = {}
    snt_len = len(snt)
    # `range` instead of the Python-2-only `xrange`, so the function runs
    # unchanged under both Python 2 and Python 3.
    for start in range(snt_len):
        # An ngram starting at `start` ends after at most /max_n/ tokens
        # and never runs past the end of the sentence.
        for end in range(start + 1, min(start + max_n, snt_len) + 1):
            key = tuple(snt[start:end])
            ret[key] = ret.get(key, 0) + 1
    return ret
def max_counts(ng1, ng2):
    """
    Combine the ngram count dictionaries /ng1/ and /ng2/ into a new
    dictionary in which every ngram carries the larger of its two
    counts. An ngram absent from one input counts as 0 there.
    Neither input dictionary is modified.
    """
    merged = ng1.copy()
    # Entries only present in /ng1/ are already in place; only the ngrams
    # listed in /ng2/ need their count compared.
    merged.update({
        ngram: max(ng1.get(ngram, 0), count)
        for ngram, count in ng2.items()
    })
    return merged
def ng_hits(hyp, ref, max_n):
    """
    Return a list of ngram counts such that each ngram count
    is the minimum of the counts in hyp and ref, up to ngram
    length /max_n/.

    /hyp/ and /ref/ are ngram count dictionaries keyed by token tuples.
    Element n-1 of the result is the total clipped match count over all
    ngrams of length n. Ngrams longer than /max_n/ are ignored.
    """
    ret = [0] * max_n
    for ng, cnt in hyp.items():
        order = len(ng)
        # The lower bound keeps a (malformed) empty ngram from being
        # credited to the highest order through the negative index -1.
        if 1 <= order <= max_n:
            ret[order - 1] += min(cnt, ref.get(ng, 0))
    return ret
class BleuScore(object):
    """
    BLEU score of a hypothesis document against a reference document,
    optionally with a bootstrap-resampled score distribution.

    /hyp/ and /ref/ must provide two parallel lists: `snt` (one token
    list per sentence) and `ngrams` (one ngram count dictionary per
    sentence).

    Attributes set by the constructor:
      hits       per-sentence lists of clipped ngram matches (see ng_hits)
      actual     score over the document as given (every sentence once)
      bootstrap  sorted scores of /bootstrap/ resampled documents, or
                 [actual] when /bootstrap/ is 0
      lower, upper, median
                 initialised to None; this class never fills them in
      hyplen, reflen, total, prec, BP
                 statistics of the most recent score() call; after
                 construction they describe the document as given
    """

    def __init__(self, hyp, ref, max_n=4, bootstrap=1000):
        self.hits = [
            ng_hits(hyp.ngrams[i], ref.ngrams[i], max_n)
            for i in range(len(hyp.ngrams))]
        self.max_n = max_n
        self.hyp = hyp
        self.ref = ref
        self.lower = None
        self.upper = None
        self.median = None
        num_snt = len(hyp.snt)
        samples = None
        if bootstrap:
            # Each resample draws as many sentences as the document has,
            # with replacement.
            samples = [
                self.score([randint(0, num_snt - 1) for _ in range(num_snt)])
                for _ in range(bootstrap)]
            samples.sort()
        # score() stores its statistics on the instance, so the real
        # document is scored last: otherwise prec/BP/hyplen/reflen would
        # be left describing an arbitrary random resample.
        self.actual = self.score(list(range(num_snt)))
        self.bootstrap = samples if samples is not None else [self.actual]

    def score(self, sample):
        """
        Return the BLEU score of the document made up of the sentences
        whose indices are listed in /sample/ (repetitions allowed).

        The score is the brevity penalty times the geometric mean of the
        ngram precisions for orders 1 .. max_n. It is 0.0 when any
        precision is zero, including when the sample has no ngrams of
        some order or no hypothesis tokens at all.

        Side effect: sets self.hyplen, self.reflen, self.total,
        self.prec and self.BP to the statistics of this sample.
        """
        orders = range(self.max_n)
        hits = [0] * self.max_n
        self.total = [0] * self.max_n
        self.hyplen = 0
        self.reflen = 0
        for i in sample:
            hyp_len = len(self.hyp.snt[i])
            self.hyplen += hyp_len
            self.reflen += len(self.ref.snt[i])
            for n in orders:
                hits[n] += self.hits[i][n]
                # A sentence of length L holds max(L - n, 0) ngrams of
                # order n + 1.
                self.total[n] += max(hyp_len - n, 0)
        # An order without any candidate ngram gets precision 0 instead
        # of raising ZeroDivisionError.
        self.prec = [
            float(hits[n]) / self.total[n] if self.total[n] else 0.0
            for n in orders]
        if self.hyplen:
            self.BP = min(
                1, math.exp(1. - float(self.reflen) / float(self.hyplen)))
        else:
            self.BP = 0.0
        # log(0) is undefined; a zero factor makes the whole product zero.
        if self.BP <= 0 or any(p <= 0 for p in self.prec):
            return 0.0
        ret = sum([math.log(p) for p in self.prec]) / self.max_n
        ret += math.log(self.BP)
        return math.exp(ret)
class Document(object):
    """
    A tokenised document: one whitespace-split token list per line.

    Attributes:
      fname   path the document was read from ("multi-ref" after merge())
      snt     list of sentences, each a list of tokens
      ngrams  list of per-sentence ngram count dictionaries
    """

    def __init__(self, fname=None, max_n=4):
        """
        Read /fname/ (gzip-compressed if the name ends in ".gz") and
        count the ngrams of each line up to length /max_n/.

        Without /fname/ an empty shell is created (snt and ngrams are
        None) that is meant to be filled in by merge().
        """
        self.fname = fname
        if fname:
            self.snt = self._read_sentences(fname)
            self.ngrams = [count_ngrams(snt, max_n) for snt in self.snt]
        else:
            self.snt = None
            self.ngrams = None

    @staticmethod
    def _read_sentences(fname):
        """Return the lines of /fname/ as lists of whitespace-split tokens."""
        opener = gzip.open if fname.endswith(".gz") else open
        # `with` closes the file handle even if reading fails.
        with opener(fname) as handle:
            lines = handle.readlines()
        sentences = []
        for line in lines:
            # gzip.open defaults to binary mode, which yields bytes on
            # Python 3; decode so tokens compare equal to those read from
            # plain-text files. On Python 2 lines are already str.
            if not isinstance(line, str):
                line = line.decode("utf-8")
            sentences.append(line.strip().split())
        return sentences

    def merge(self, R):
        """
        Turn this document into the multi-reference combination of the
        documents in /R/: sentences are taken from R[0] and each
        sentence's ngram counts are the per-ngram maximum over all
        references.

        Raises AssertionError if the references differ in their number
        of sentences.
        """
        for other in R[1:]:
            if len(other.ngrams) != len(R[0].ngrams):
                raise AssertionError(
                    "Mismatch in number of sentences between references")
        self.fname = "multi-ref"
        self.ngrams = list(R[0].ngrams)
        self.snt = list(R[0].snt)
        for other in R[1:]:
            for i in range(len(self.ngrams)):
                self.ngrams[i] = max_counts(self.ngrams[i], other.ngrams[i])

    def update(self, hyp, R):
        """
        For every sentence of /hyp/, store in self.snt the reference
        sentence from /R/ whose length is closest to the hypothesis
        length; among equally close references the shorter one wins, and
        among full ties the earliest reference in /R/.

        Raises AssertionError if any reference has a different number of
        sentences than /hyp/.
        """
        # Validate every reference before indexing into any of them. An
        # explicit raise (unlike an assert statement) also survives -O.
        for ref_doc in R:
            if len(ref_doc.snt) != len(hyp.snt):
                raise AssertionError(
                    "Mismatch in number of sentences " +
                    "between reference and candidate")
        for i, hyp_snt in enumerate(hyp.snt):
            clen = len(hyp_snt)
            # min() returns the first minimal element, so the earliest
            # reference wins when distance and length are both tied.
            best = min(
                range(len(R)),
                key=lambda k: (abs(len(R[k].snt[i]) - clen),
                               len(R[k].snt[i])))
            self.snt[i] = R[best].snt[i]
def bootstrap_quantile(scores, fraction):
    """
    Return the element of the sorted, non-empty list /scores/ found at
    relative position /fraction/ (0 = smallest, 1 = largest).

    The index is clamped to the list bounds, so a fraction of 1.0 (or a
    list holding only a single score) cannot cause an IndexError.
    """
    index = int(fraction * len(scores))
    return scores[max(0, min(index, len(scores) - 1))]


def main(argv=None):
    """
    Command line entry point: print, for each candidate file, the
    multi-reference BLEU score with its bootstrap confidence interval
    and median, optionally followed by the score against each
    individual reference.

    /argv/ is the argument list without the program name; None means
    sys.argv[1:].
    """
    argparser = ArgumentParser()
    # Both file lists are mandatory: without them there is nothing to score.
    argparser.add_argument(
        "-r", "--ref", nargs='+', required=True,
        help="Reference translation(s).")
    argparser.add_argument(
        "-c", "--cand", nargs='+', required=True,
        help="Candidate translations.")
    argparser.add_argument(
        "-i", "--individual", action='store_true',
        help="Compute BLEU scores for individual references.")
    argparser.add_argument(
        "-b", "--bootstrap", type=int, default=1000,
        help="Sample size for bootstrap resampling.")
    argparser.add_argument(
        "-a", "--alpha", type=float, default=.05,
        help="1-alpha = confidence interval.")
    args = argparser.parse_args(argv)
    if args.bootstrap < 0:
        argparser.error("--bootstrap must not be negative")
    if not 0 <= args.alpha <= 1:
        argparser.error("--alpha must lie between 0 and 1")

    R = [Document(fname) for fname in args.ref]
    C = [Document(fname) for fname in args.cand]
    Rx = Document()  # for multi-reference BLEU
    Rx.merge(R)
    for c in C:
        # compute multi-reference BLEU
        Rx.update(c, R)
        bleu = BleuScore(c, Rx, bootstrap=args.bootstrap)
        # print(...) with one pre-formatted string behaves the same as a
        # statement (Python 2) and as a function (Python 3).
        print("%5.2f %s [%5.2f-%5.2f; %5.2f] %s" % (
            100 * bleu.actual,
            os.path.basename(Rx.fname),
            100 * bootstrap_quantile(bleu.bootstrap, args.alpha / 2),
            100 * bootstrap_quantile(bleu.bootstrap, 1 - (args.alpha / 2)),
            100 * bootstrap_quantile(bleu.bootstrap, .5),
            c.fname))
        if args.individual:
            for r in R:
                # Only the plain score is printed here, so the bootstrap
                # resampling is switched off.
                bleu = BleuScore(c, r, bootstrap=0)
                print(" %5.2f %s" % (
                    100 * bleu.actual, os.path.basename(r.fname)))


if __name__ == "__main__":
    main()
|