#!/usr/bin/env python
# compute Bleu scores with confidence intervals via bootstrap resampling
# written by Ulrich Germann
#
# This file is part of moses.  Its use is licensed under the GNU Lesser General
# Public License version 2.1 or, at your option, any later version.

from argparse import ArgumentParser
import math
import os
from random import randint
import sys
import gzip


def count_ngrams(snt, max_n):
    """
    Return a dictionary of ngram counts (up to length /max_n/)
    for sentence (list of words) /snt/.

    Keys are tuples of words, values are occurrence counts.
    """
    ret = {}
    for i in range(len(snt)):
        # k is the exclusive end index, so snt[i:k] has between 1 and max_n
        # words and never runs past the end of the sentence.
        for k in range(i + 1, min(i + max_n + 1, len(snt) + 1)):
            key = tuple(snt[i:k])
            ret[key] = ret.get(key, 0) + 1
    return ret


def max_counts(ng1, ng2):
    """
    Return a dictionary of ngram counts such that each count is
    the greater of the two individual counts for each ngram in the
    input ngram count dictionaries /ng1/ and /ng2/.

    Neither input dictionary is modified.
    """
    ret = ng1.copy()
    for k, v in ng2.items():
        ret[k] = max(ret.get(k, 0), v)
    return ret


def ng_hits(hyp, ref, max_n):
    """
    Return a list of ngram counts such that each ngram count
    is the minimum of the counts in hyp and ref, up to ngram
    length /max_n/.

    /hyp/ and /ref/ are ngram count dictionaries as produced by
    count_ngrams(); element n-1 of the result is the clipped number
    of matching ngrams of length n.
    """
    ret = [0] * max_n
    for ng, cnt in hyp.items():
        if len(ng) <= max_n:
            ret[len(ng) - 1] += min(cnt, ref.get(ng, 0))
    return ret


def bootstrap_percentile(sorted_scores, fraction):
    """
    Return the element of the ascending list /sorted_scores/ found at
    relative position /fraction/ (0 = smallest, 1 = largest).

    The index is clamped to the valid range, so fraction == 1 (or a
    one-element list) cannot run off the end of the list.
    """
    idx = int(fraction * len(sorted_scores))
    idx = min(max(idx, 0), len(sorted_scores) - 1)
    return sorted_scores[idx]


class BleuScore:
    """
    Corpus-level BLEU of candidate document /hyp/ against reference
    document /ref/, together with a sorted list of bootstrap scores.

    Attributes set by the constructor:
      actual     BLEU over all sentences, in their original order
      bootstrap  ascending list of BLEU scores, one per resample; holds
                 just [actual] when resampling is disabled
      hyplen, reflen, total, prec, BP
                 statistics of the most recent score() call; after
                 construction they describe the full corpus
      lower, upper, median
                 placeholders, left as None by this class
    """

    def __init__(self, hyp, ref, max_n=4, bootstrap=1000):
        # Clipped ngram matches per sentence; the explicit index keeps a
        # sentence-count mismatch loud (IndexError) instead of truncating.
        self.hits = [ng_hits(hyp_ngrams, ref.ngrams[i], max_n)
                     for i, hyp_ngrams in enumerate(hyp.ngrams)]
        self.max_n = max_n
        self.hyp = hyp
        self.ref = ref
        self.lower = None
        self.upper = None
        self.median = None

        num_snt = len(hyp.snt)
        samples = []
        if bootstrap and bootstrap > 0:
            # Each resample draws num_snt sentence indices with replacement.
            samples = sorted(
                self.score([randint(0, num_snt - 1) for _ in range(num_snt)])
                for _ in range(bootstrap))

        # Score the real corpus last: score() overwrites hyplen, reflen,
        # total, prec and BP, and those should describe the actual corpus
        # rather than whichever random resample happened to run last.
        self.actual = self.score(list(range(num_snt)))
        self.bootstrap = samples or [self.actual]

    def score(self, sample):
        """
        Return BLEU over the sentences whose indices are listed in
        /sample/ (indices may repeat).

        Side effect: updates self.hyplen, self.reflen, self.total,
        self.prec and self.BP to the statistics of this sample.

        Returns 0.0 when any ngram precision is zero, when an ngram order
        has no candidate ngrams at all, or when the sampled candidate text
        is empty; the geometric mean of the precisions is zero (or
        undefined) in those cases.
        """
        hits = [0] * self.max_n
        self.total = [0] * self.max_n
        self.hyplen = 0
        self.reflen = 0
        for i in sample:
            hyp_len = len(self.hyp.snt[i])
            self.hyplen += hyp_len
            self.reflen += len(self.ref.snt[i])
            for n in range(self.max_n):
                hits[n] += self.hits[i][n]
                # A sentence of L words holds L - n ngrams of length n + 1.
                self.total[n] += max(hyp_len - n, 0)

        self.prec = [float(h) / t if t else 0.0
                     for h, t in zip(hits, self.total)]
        if self.hyplen:
            # Brevity penalty: exp(1 - r/c), capped at 1.
            self.BP = min(
                1, math.exp(1. - float(self.reflen) / float(self.hyplen)))
        else:
            self.BP = 0.0

        # log(0) is undefined; BLEU is zero as soon as one factor is zero.
        if self.BP == 0 or any(p == 0 for p in self.prec):
            return 0.0

        ret = sum(math.log(p) for p in self.prec) / self.max_n
        ret += math.log(self.BP)
        return math.exp(ret)


class Document:
    """
    A tokenized text: /snt/ is a list of sentences (each a list of
    whitespace-separated tokens) and /ngrams/ the matching list of ngram
    count dictionaries (ngram length up to 4).

    Without a file name both attributes are None until merge() fills them.
    """

    def __init__(self, fname=None):
        self.fname = fname
        if fname:
            if fname.endswith(".gz"):
                # Text mode, so that tokens have the same string type as
                # those read from an uncompressed file.
                with gzip.open(fname, "rt") as handle:
                    self.snt = [line.strip().split() for line in handle]
            else:
                with open(fname) as handle:
                    self.snt = [line.strip().split() for line in handle]
            self.ngrams = [count_ngrams(snt, 4) for snt in self.snt]
        else:
            self.snt = None
            self.ngrams = None

    def merge(self, R):
        """
        Turn this document into the multi-reference combination of the
        documents in list /R/: for every sentence, each ngram count is the
        maximum over all references.  Sentences are initialised from R[0].
        """
        self.fname = "multi-ref"
        self.ngrams = list(R[0].ngrams)
        self.snt = list(R[0].snt)
        for i in range(len(self.ngrams)):
            for other in R[1:]:
                self.ngrams[i] = max_counts(self.ngrams[i], other.ngrams[i])

    def update(self, hyp, R):
        """
        For every sentence of candidate /hyp/, store in self.snt the
        reference sentence from /R/ whose length is closest to the
        candidate's; on a tie the shorter reference is chosen.

        Raises AssertionError if any reference has a different number of
        sentences than the candidate.
        """
        # Validate up front, before any sentence is indexed.
        for ref_doc in R:
            assert len(ref_doc.snt) == len(hyp.snt), (
                "Mismatch in number of sentences " +
                "between reference and candidate")

        for i, hyp_snt in enumerate(hyp.snt):
            clen = len(hyp_snt)
            best = 0
            for k in range(1, len(R)):
                cand_len = len(R[k].snt[i])
                best_len = len(R[best].snt[i])
                cand_diff = abs(cand_len - clen)
                best_diff = abs(best_len - clen)
                if cand_diff < best_diff:
                    best = k
                elif cand_diff == best_diff and cand_len < best_len:
                    best = k
            self.snt[i] = R[best].snt[i]


def main(argv=None):
    """
    Command-line entry point: print multi-reference BLEU with a bootstrap
    confidence interval for each candidate file and, with --individual,
    the BLEU score against each single reference.

    /argv/ is the argument list without the program name; it defaults to
    sys.argv[1:].
    """
    argparser = ArgumentParser()
    argparser.add_argument(
        "-r", "--ref", nargs='+', required=True,
        help="Reference translation(s).")
    argparser.add_argument(
        "-c", "--cand", nargs='+', required=True,
        help="Candidate translations.")
    argparser.add_argument(
        "-i", "--individual", action='store_true',
        help="Compute BLEU scores for individual references.")
    argparser.add_argument(
        "-b", "--bootstrap", type=int, default=1000,
        help="Sample size for bootstrap resampling.")
    argparser.add_argument(
        "-a", "--alpha", type=float, default=.05,
        help="1-alpha = confidence interval.")
    args = argparser.parse_args(sys.argv[1:] if argv is None else argv)

    R = [Document(fname) for fname in args.ref]
    C = [Document(fname) for fname in args.cand]
    Rx = Document()  # for multi-reference BLEU
    Rx.merge(R)
    for c in C:
        # compute multi-reference BLEU
        Rx.update(c, R)
        bleu = BleuScore(c, Rx, bootstrap=args.bootstrap)
        print("%5.2f %s [%5.2f-%5.2f; %5.2f] %s" % (
            100 * bleu.actual,
            os.path.basename(Rx.fname),
            100 * bootstrap_percentile(bleu.bootstrap, args.alpha / 2),
            100 * bootstrap_percentile(bleu.bootstrap, 1 - (args.alpha / 2)),
            100 * bootstrap_percentile(bleu.bootstrap, .5),
            c.fname))

        if args.individual:
            for r in R:
                bleu = BleuScore(c, r, bootstrap=args.bootstrap)
                print(" %5.2f %s" % (
                    100 * bleu.actual, os.path.basename(r.fname)))


if __name__ == "__main__":
    main()