File size: 1,024 Bytes
158b61b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
#!/usr/bin/env python

#
#
#

import heapq
import math
import random
import sys

from bleu import BleuScorer


class Sample:
  """A pair of hypotheses, and their score difference.

  Attributes:
    hyp1, hyp2: the two hypotheses; each must expose a numeric ``score``.
    diff: absolute difference between the two scores (always >= 0).

  Samples order by ``diff``, so a heap of samples keeps the pair with the
  smallest score difference at its root.
  """
  def __init__(self,hyp1,hyp2):
    self.hyp1 = hyp1
    self.hyp2 = hyp2
    self.diff = abs(hyp1.score-hyp2.score)

  def __lt__(self,other):
    # heapq and sorting only ever use "<". Python 3 ignores __cmp__, so
    # without this method Sample objects cannot be placed on a heap there.
    return self.diff < other.diff

  def __cmp__(self,other):
    # Only consulted by Python 2. Spelled without the cmp() builtin so the
    # method body is valid on either interpreter.
    return (self.diff > other.diff) - (self.diff < other.diff)

class HopkinsMaySampler:
  """Implements Hopkins & May sampling.

  Random pairs of hypotheses are drawn from an n-best list; pairs whose
  score difference is below ``min_diff`` are discarded, and of the rest
  only the ``nsamples`` pairs with the largest difference are kept.
  """
  def __init__(self):
    self.ncandidates = 5000 # Gamma in Hopkins and May
    self.nsamples = 50 # Xi in Hopkins and May
    self.min_diff = 0.05 # Minimum scoring difference

  def sample(self,nbest):
    """Draw candidate pairs from ``nbest.hyps`` and keep the best ones.

    Args:
      nbest: object whose ``hyps`` attribute is a sequence of hypotheses,
        each exposing a numeric ``score``.

    Returns:
      A list of at most ``nsamples`` Sample objects. The list is in heap
      order (smallest ``diff`` first), not fully sorted. It is empty when
      there are no hypotheses or no drawn pair reaches ``min_diff``.
    """
    hyps = nbest.hyps
    # random.choice raises IndexError on an empty sequence; an empty
    # n-best list simply has no pairs to offer.
    if len(hyps) == 0:
      return []
    samples = []
    # range (not xrange) so the loop runs on both Python 2 and Python 3.
    for _ in range(self.ncandidates):
      # Both draws are independent, so the same hypothesis may be picked
      # twice; such a pair has diff 0 and is filtered out just below
      # (for any positive min_diff).
      candidate = Sample(random.choice(hyps),random.choice(hyps))
      if candidate.diff < self.min_diff:
        continue
      # maintain nsamples biggest samples: the min-heap root is always the
      # weakest kept pair, so once full each push evicts the smallest.
      if len(samples) < self.nsamples:
        heapq.heappush(samples,candidate)
      else:
        heapq.heappushpop(samples,candidate)
    return samples