File size: 1,218 Bytes
158b61b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 |
#ifndef __sampling_h
#define __sampling_h
#include <algorithm>
#include <stdint.h>
#include <vector>
#include <boost/dynamic_bitset.hpp>
#include "util/random.hh"
// Utility functions for proper sub-sampling.
// (c) 2007-2012 Ulrich Germann
namespace Moses
{

/// Select a random sample of size /s/ without replacement from the
/// range of integers [0,N).
///
/// @param v  output vector; it is resized to min(s,N) and every element
///           is overwritten with a sampled value (assigned to idx_t).
/// @param s  requested sample size; clamped to N if it is larger.
/// @param N  exclusive upper bound of the integer range to sample from.
///
/// The order of the values in /v/ depends on which strategy is used:
/// the rejection-sampling branch stores values in the order in which
/// they were drawn, the linear sweep stores them in ascending order.
///
/// NOTE(review): both branches assume util::rand_excl(k) returns a
/// uniformly distributed value in [0,k) -- confirm against util/random.hh.
template<typename idx_t>
void
randomSample(std::vector<idx_t>& v, size_t s, size_t N)
{
  // see also Knuth: Art of Computer Programming Vol. 2, p. 142
  s = std::min(s,N);
  v.resize(s);

  // The first option tries to be a bit more efficient than O(N) in
  // picking the samples. The threshold is an ad-hoc, off-the-cuff
  // guess; the optimal break-even point between a linear sweep and
  // repeatedly picking random numbers (with the risk of hitting the
  // same number many times) has not been determined.
  if (s*10 < N) {
    // Sparse sample: draw candidates at random and redraw whenever the
    // candidate has been picked before. /check/ has one bit per integer
    // in [0,N) marking the values already taken.
    boost::dynamic_bitset<uint64_t> check(N,0);
    for (size_t i = 0; i < s; i++) {
      size_t x = util::rand_excl(N);
      while (check[x]) x = util::rand_excl(N);
      check[x] = true;
      v[i] = x;
    }
  } else {
    // Dense sample: sweep over [0,N) once and keep candidate /t/ with
    // probability (s-m)/(N-t), where /m/ is the number of values kept so
    // far. If s==N everything is kept and no random numbers are drawn.
    //
    // The loop stops as soon as the sample is complete (m == s). It must
    // not continue with m == s: nothing more can be selected at that
    // point, so further iterations would only burn random numbers, and
    // v[m] would be out of bounds.
    size_t m = 0;
    for (size_t t = 0; m < s && t < N; t++)
      if (s==N || util::rand_excl(N-t) < s-m)
        v[m++] = t;
  }
}

} // namespace Moses
#endif
|