File size: 1,218 Bytes
158b61b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
#ifndef __sampling_h
#define __sampling_h
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <vector>

#include <boost/dynamic_bitset.hpp>

#include "util/random.hh"

// Utility functions for proper sub-sampling.
// (c) 2007-2012 Ulrich Germann


namespace Moses
{

// Select a random sample of size /s/ without replacement from the
// range of integers [0,N).
//
//   v  output vector; resized to min(s,N) and overwritten with the
//      selected integers (converted to idx_t)
//   s  requested sample size; clamped to N if it is larger
//   N  exclusive upper bound of the range to sample from
//
// The order of the elements in /v/ depends on the strategy used:
// the rejection branch stores values in the order they were drawn,
// the other branches store them in ascending order.
template<typename idx_t>
void
randomSample(std::vector<idx_t>& v, size_t s, size_t N)
{
  // see also Knuth: Art of Computer Programming Vol. 2, p. 142

  s = std::min(s,N);
  v.resize(s);

  // Nothing to pick; also avoids allocating an N-bit set below for an
  // empty sample.
  if (s == 0) return;

  // The first option tries to be a bit more efficient than O(N) in
  // picking the samples. The threshold is an ad-hoc, off-the-cuff
  // guess. I still need to figure out the optimal break-even point
  // between a linear sweep and repeatedly picking random numbers with
  // the risk of hitting the same number many times.
  if (s*10<N) {
    // Rejection sampling: draw numbers and re-draw whenever the number
    // has been picked before; /check/ marks the numbers already taken.
    boost::dynamic_bitset<uint64_t> check(N,0);
    for (size_t i = 0; i < v.size(); i++) {
      size_t x = util::rand_excl(N);
      while (check[x]) x = util::rand_excl(N);
      check[x]=true;
      v[i] = static_cast<idx_t>(x);
    }
  } else if (s == N) {
    // The sample is the entire range; no random numbers are needed.
    for (size_t t = 0; t < N; t++)
      v[t] = static_cast<idx_t>(t);
  } else {
    // Linear sweep (selection sampling): candidate /t/ is accepted if a
    // draw from util::rand_excl(N-t) falls below the number of items
    // still needed (s-m). Assuming rand_excl(k) returns a value in
    // [0,k), the candidate is always accepted once N-t == s-m, so /v/
    // is completely filled. The sweep stops as soon as the sample is
    // complete (m == s) instead of scanning on to N.
    size_t m=0;
    for (size_t t = 0; m < s && t < N; t++)
      if (util::rand_excl(N-t) < s-m) v[m++] = static_cast<idx_t>(t);
  }
}

};

#endif